From 5e74a5c86aacdff34011aca5c6ddf25f0c71c37f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Tue, 20 Feb 2024 15:34:11 +0100 Subject: [PATCH 001/351] [GlobalIsel] Combine ADDE Clang has them as builtins (__builtin_addc). The middle end has no intrinsics for them. They are used in legalization operations. AArch64: ADCS Add with carry and set flags On Neoverse V2, they run at half the throughput of basic arithmetic and have a limited set of pipelines. --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 10 +- .../CodeGen/GlobalISel/GenericMachineInstrs.h | 17 + .../include/llvm/Target/GlobalISel/Combine.td | 8 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 209 ++ .../AArch64/GlobalISel/combine-adde.mir | 300 +++ llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll | 48 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 1745 ++++++++++------- .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 975 ++++++--- 8 files changed, 2335 insertions(+), 977 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 23728636498ba..abc2ebdfd878c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -810,12 +810,15 @@ class CombinerHelper { /// Combine selects. bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo); - /// Combine ands, + /// Combine ands. bool matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo); - /// Combine ors, + /// Combine ors. bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Combine addes. + bool matchAddCarryInOut(MachineInstr &MI, BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. 
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; @@ -919,6 +922,7 @@ class CombinerHelper { bool isZeroOrZeroSplat(Register Src, bool AllowUndefs); bool isConstantSplatVector(Register Src, int64_t SplatValue, bool AllowUndefs); + bool isConstantOrConstantVectorI(Register Src); std::optional getConstantOrConstantSplatVector(Register Src); @@ -930,6 +934,8 @@ class CombinerHelper { // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y. bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo); + + bool isZExtOrTruncLegal(LLT ToTy, LLT FromTy) const; }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index f5a6528d10a97..e46d2d1aac0e8 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -359,6 +359,8 @@ class GBinOpCarryOut : public GenericMachineInstr { Register getCarryOutReg() const { return getReg(1); } MachineOperand &getLHS() { return getOperand(2); } MachineOperand &getRHS() { return getOperand(3); } + Register getLHSReg() { return getOperand(2).getReg(); } + Register getRHSReg() { return getOperand(3).getReg(); } static bool classof(const MachineInstr *MI) { switch (MI->getOpcode()) { @@ -448,6 +450,21 @@ class GAddSubCarryInOut : public GAddSubCarryOut { } }; +/// Represents overflowing add operations that also consume a carry-in. +/// G_UADDE, G_SADDE +class GAddCarryInOut : public GAddSubCarryInOut { +public: + static bool classof(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case TargetOpcode::G_UADDE: + case TargetOpcode::G_SADDE: + return true; + default: + return false; + } + } +}; + /// Represents a call to an intrinsic. 
class GIntrinsic final : public GenericMachineInstr { public: diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 7eadb718f1641..3a82bc14885be 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1253,6 +1253,12 @@ def match_ors : GICombineRule< [{ return Helper.matchOr(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; +def match_addes : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SADDE, G_UADDE):$root, + [{ return Helper.matchAddCarryInOut(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + // Combines concat operations def concat_matchinfo : GIDefMatchData<"SmallVector">; def combine_concat_vector : GICombineRule< @@ -1335,7 +1341,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, and_or_disjoint_mask, fma_combines, fold_binop_into_select, sub_add_reg, select_to_minmax, redundant_binop_in_equality, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, - combine_concat_vector]>; + combine_concat_vector, match_addes]>; // A combine group used to for prelegalizer combiners at -O0. 
The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 779ec49f4d13a..2cfc7387ed976 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6342,6 +6342,23 @@ CombinerHelper::getConstantOrConstantSplatVector(Register Src) { return Value; } +bool CombinerHelper::isConstantOrConstantVectorI(Register Src) { + auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI); + if (IConstant) + return true; + GBuildVector *BuildVector = getOpcodeDef(Src, MRI); + if (!BuildVector) + return false; + unsigned NumSources = BuildVector->getNumSources(); + for (unsigned I = 0; I < NumSources; ++I) { + std::optional IConstant = + getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI); + if (!IConstant) + return false; + } + return true; // FIXME: G_SPLAT_VECTOR +} + // TODO: use knownbits to determine zeros bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, BuildFnTy &MatchInfo) { @@ -6906,3 +6923,195 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) { return false; } + +bool CombinerHelper::isZExtOrTruncLegal(LLT ToTy, LLT FromTy) const { + // Copy. + if (ToTy == FromTy) + return true; + + if (isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {ToTy, FromTy}})) + return true; + + if (isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {ToTy, FromTy}})) + return true; + + return false; +} + +bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, + BuildFnTy &MatchInfo) { + GAddCarryInOut *Add = cast(&MI); + + // adde has no flags. 
+ Register Dst = Add->getDstReg(); + Register Carry = Add->getCarryOutReg(); + Register CarryIn = Add->getCarryInReg(); + Register LHS = Add->getLHSReg(); + Register RHS = Add->getRHSReg(); + bool IsSigned = Add->isSigned(); + LLT DstTy = MRI.getType(Dst); + LLT CarryTy = MRI.getType(Carry); + LLT OperandTy = MRI.getType(LHS); + LLT CarryInTy = MRI.getType(CarryIn); + + // FIXME: handle undef + + // fold sadde, if the carry is dead -> add(add(LHS, RHS), + // zextOrTrunc(CarryIn)), undef. + if (MRI.use_nodbg_empty(Carry) && IsSigned && MRI.hasOneNonDBGUse(Dst) && + isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) && + isZExtOrTruncLegal(DstTy, CarryInTy)) { + MatchInfo = [=](MachineIRBuilder &B) { + auto A = B.buildAdd(DstTy, LHS, RHS); + Register AReg = A.getReg(0); + auto ZextCarryIn = B.buildZExtOrTrunc(DstTy, CarryIn); + Register ZextCarryInReg = ZextCarryIn.getReg(0); + B.buildAdd(Dst, AReg, ZextCarryInReg); + B.buildUndef(Carry); + }; + return true; + } + + // We want do fold the [u|s]adde. + if (!MRI.hasOneNonDBGUse(Dst) || !MRI.hasOneNonDBGUse(Carry)) + return false; + + // The parameters of the adde must be integer-like. + std::optional MaybeLHS = getConstantOrConstantSplatVector(LHS); + std::optional MaybeRHS = getConstantOrConstantSplatVector(RHS); + std::optional MaybeCarryIn = getConstantOrConstantSplatVector(CarryIn); + + // fold adde(c, c, c) -> c, carry + if (MaybeLHS && MaybeRHS && MaybeCarryIn && + isConstantLegalOrBeforeLegalizer(DstTy) && + isConstantLegalOrBeforeLegalizer(CarryTy)) { + // They must all have the same bitwidth. Otherwise APInt might + // assert. Prelegalization, they may have widely different bitwidths. 
+ unsigned BitWidth = + std::max(std::max(MaybeLHS->getBitWidth(), MaybeRHS->getBitWidth()), + MaybeCarryIn->getBitWidth()); + if (IsSigned) { + APInt LHS = MaybeLHS->sext(BitWidth); + APInt RHS = MaybeRHS->sext(BitWidth); + APInt CarryIn = MaybeCarryIn->zext(BitWidth); + bool FirstOverflowed = false; + bool SecondOverflowed = false; + APInt Result = + LHS.sadd_ov(RHS, FirstOverflowed).sadd_ov(CarryIn, SecondOverflowed); + MatchInfo = [=](MachineIRBuilder &B) { + B.buildConstant(Dst, Result); + B.buildConstant(Carry, FirstOverflowed | SecondOverflowed); + }; + return true; + } else if (!IsSigned) { + APInt LHS = MaybeLHS->zext(BitWidth); + APInt RHS = MaybeRHS->zext(BitWidth); + APInt CarryIn = MaybeCarryIn->zext(BitWidth); + bool FirstOverflowed = false; + bool SecondOverflowed = false; + APInt Result = + LHS.uadd_ov(RHS, FirstOverflowed).uadd_ov(CarryIn, SecondOverflowed); + MatchInfo = [=](MachineIRBuilder &B) { + B.buildConstant(Dst, Result); + B.buildConstant(Carry, FirstOverflowed | SecondOverflowed); + }; + return true; + } + } + + // canonicalize constant to RHS. 
+  if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) {
+    if (IsSigned) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSAdde(Dst, Carry, RHS, LHS, CarryIn);
+      };
+      return true;
+    } else {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildUAdde(Dst, Carry, RHS, LHS, CarryIn);
+      };
+      return true;
+    }
+  }
+
+  // fold adde(LHS, RHS, 0) -> addo(LHS, RHS)
+  if (MaybeCarryIn && *MaybeCarryIn == 0) {
+    if (IsSigned && isLegalOrBeforeLegalizer(
+                        {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}})) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSAddo(Dst, Carry, LHS, RHS);
+      };
+      return true;
+    } else if (!IsSigned &&
+               isLegalOrBeforeLegalizer(
+                   {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}})) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildUAddo(Dst, Carry, LHS, RHS);
+      };
+      return true;
+    }
+  }
+
+  // fold adde(LHS, 0, Carry) -> addo(LHS, Carry)
+  if (MaybeRHS && *MaybeRHS == 0) {
+    if (IsSigned &&
+        isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}}) &&
+        isZExtOrTruncLegal(OperandTy, CarryInTy)) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn);
+        Register ZextCarryInReg = ZextCarryIn.getReg(0);
+        B.buildSAddo(Dst, Carry, LHS, ZextCarryInReg);
+      };
+      return true;
+    } else if (!IsSigned &&
+               isLegalOrBeforeLegalizer(
+                   {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}}) &&
+               isZExtOrTruncLegal(OperandTy, CarryInTy)) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn);
+        Register ZextCarryInReg = ZextCarryIn.getReg(0);
+        B.buildUAddo(Dst, Carry, LHS, ZextCarryInReg);
+      };
+      return true;
+    }
+  }
+
+  // We lower to 2*addo + 1*or.
+ if (IsSigned && + isLegalOrBeforeLegalizer( + {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}}) && + isLegalOrBeforeLegalizer({TargetOpcode::G_OR, {DstTy}}) && + isZExtOrTruncLegal(OperandTy, CarryInTy)) { + MatchInfo = [=](MachineIRBuilder &B) { + auto First = B.buildSAddo(DstTy, CarryTy, LHS, RHS); + Register FirstResult = First.getReg(0); + Register FirstCarry = First.getReg(1); + auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + auto Second = B.buildSAddo(DstTy, CarryTy, FirstResult, ZextCarryIn); + Register Result = Second.getReg(0); + Register SecondCarry = Second.getReg(1); + B.buildCopy(Dst, Result); + B.buildOr(Carry, FirstCarry, SecondCarry); + }; + return true; + } else if (!IsSigned && + isLegalOrBeforeLegalizer( + {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}}) && + isLegalOrBeforeLegalizer({TargetOpcode::G_OR, {DstTy}}) && + isZExtOrTruncLegal(OperandTy, CarryInTy)) { + MatchInfo = [=](MachineIRBuilder &B) { + auto First = B.buildUAddo(DstTy, CarryTy, LHS, RHS); + Register FirstResult = First.getReg(0); + Register FirstCarry = First.getReg(1); + auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + auto Second = B.buildUAddo(DstTy, CarryTy, FirstResult, ZextCarryIn); + Register Result = Second.getReg(0); + Register SecondCarry = Second.getReg(1); + B.buildCopy(Dst, Result); + B.buildOr(Carry, FirstCarry, SecondCarry); + }; + return true; + } + + return false; +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir new file mode 100644 index 0000000000000..61c7f56f4b260 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir @@ -0,0 +1,300 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +--- +# add, _ = sadde(_, _, In) +name: carryout_unused 
+body: | + bb.0.entry: + ; CHECK-LABEL: name: carryout_unused + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: %add:_(s64) = G_ADD [[ADD]], [[ZEXT]] + ; CHECK-NEXT: $x0 = COPY %add(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %3 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + $x0 = COPY %add +... +--- +# add, _ = uadde(_, _, In) +name: carryout_unused_unsigned +body: | + bb.0.entry: + ; CHECK-LABEL: name: carryout_unused_unsigned + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_UADDE [[COPY]], [[COPY]], %carry_in + ; CHECK-NEXT: $x0 = COPY %add(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %3 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + $x0 = COPY %add +... 
+--- +# add, multi_c = sadde(L, R, In) +name: multi_use_unsigned +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_use_unsigned + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_UADDE [[COPY]], [[COPY]], %carry_in + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: %carry_out_ext2:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + ; CHECK-NEXT: $x2 = COPY %carry_out_ext2(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %3 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + %carry_out_ext2:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext + $x2 = COPY %carry_out_ext2 +... +--- +# add, c = sadde(L, R, In) +name: constant_fold_signed +body: | + bb.0.entry: + ; CHECK-LABEL: name: constant_fold_signed + ; CHECK: %add:_(s64) = G_CONSTANT i64 29 + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = G_CONSTANT i64 1 + %lhs:_(s64) = G_CONSTANT i64 11 + %rhs:_(s64) = G_CONSTANT i64 17 + %carry_in:_(s1) = G_CONSTANT i1 1 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = uadde(L, R, In) +name: constant_fold_unsigned +body: | + bb.0.entry: + ; CHECK-LABEL: name: constant_fold_unsigned + ; CHECK: %add:_(s64) = G_CONSTANT i64 27 + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = G_CONSTANT i64 1 + %lhs:_(s64) = G_CONSTANT i64 19 + %rhs:_(s64) = G_CONSTANT i64 7 + %carry_in:_(s1) = G_CONSTANT i1 1 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... +--- +# add, c = uadde(L, R, In) +name: canonicalize_to_rhs_plus_lower +body: | + bb.0.entry: + ; CHECK-LABEL: name: canonicalize_to_rhs_plus_lower + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %lhs:_(s64) = G_CONSTANT i64 19 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[COPY]], %lhs + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(s64), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[ZEXT]] + ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[UADDO1]], [[UADDO3]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY [[UADDO2]](s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = G_CONSTANT i64 19 + %rhs:_(s64) = COPY %3 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = sadde(L, R, 0) +name: fold_to_addo_l_r +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_addo_l_r + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_SADDO [[COPY]], [[COPY1]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %4 + %carry_in:_(s1) = G_CONSTANT i1 0 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... +--- +# add, c = sadde(L, 0, CarryIn) +name: fold_to_addo_l_carryin +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_addo_l_carryin + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_SADDO [[COPY]], [[ZEXT]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = G_CONSTANT i64 0 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = sadde(L, R, CarryIn) +name: fold_to_lower_signed +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_lower_signed + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[SADDO:%[0-9]+]]:_(s64), [[SADDO1:%[0-9]+]]:_(s1) = G_SADDO [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: [[SADDO2:%[0-9]+]]:_(s64), [[SADDO3:%[0-9]+]]:_(s1) = G_SADDO [[SADDO]], [[ZEXT]] + ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[SADDO1]], [[SADDO3]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY [[SADDO2]](s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %4 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = uadde(L, R, CarryIn) +name: fold_to_lower_unsigned +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_lower_unsigned + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(s64), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[ZEXT]] + ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[UADDO1]], [[UADDO3]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY [[UADDO2]](s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %4 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = uadde(L, R, CarryIn) +name: fold_to_lower_vectorized +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_lower_vectorized + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %onebit:_(s1) = G_TRUNC [[COPY4]](s64) + ; CHECK-NEXT: %lhs:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64) + ; CHECK-NEXT: %rhs:_(<2 x s64>) = G_BUILD_VECTOR [[COPY2]](s64), [[COPY3]](s64) + ; CHECK-NEXT: %carry_in:_(<2 x s1>) = G_BUILD_VECTOR %onebit(s1), %onebit(s1) + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(<2 x s64>), [[UADDO1:%[0-9]+]]:_(<2 x s1>) = G_UADDO %lhs, %rhs + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<2 x s64>) = G_ZEXT %carry_in(<2 x s1>) + ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(<2 x s64>), [[UADDO3:%[0-9]+]]:_(<2 x s1>) = G_UADDO [[UADDO]], [[ZEXT]] + ; CHECK-NEXT: %carry_out:_(<2 x s1>) = G_OR [[UADDO1]], [[UADDO3]] + ; CHECK-NEXT: %zext:_(<2 x s64>) = G_ZEXT %carry_out(<2 x s1>) + ; CHECK-NEXT: $q0 = COPY %zext(<2 x s64>) + ; CHECK-NEXT: $q0 = COPY [[UADDO2]](<2 x s64>) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %onebit:_(s1) = G_TRUNC %4 + %lhs:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64) + %rhs:_(<2 x s64>) = G_BUILD_VECTOR %2(s64), %3(s64) + %carry_in:_(<2 x s1>) = G_BUILD_VECTOR %onebit(s1), %onebit(s1) + %add:_(<2 x s64>), %carry_out:_(<2 x s1>) = G_UADDE %lhs, %rhs, %carry_in + %zext:_(<2 x s64>) = G_ZEXT %carry_out(<2 x s1>) + $q0 = COPY %zext + $q0 = COPY %add +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll index ff5880819020d..f337e6cf55292 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -39,9 +39,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) { ; GFX7-LABEL: v_uaddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], v0, v2 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX7-NEXT: v_add_i32_e64 v1, s[4:5], v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -49,9 +52,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) { ; GFX8-LABEL: v_uaddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -59,9 +65,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) { ; GFX9-LABEL: v_uaddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, 
v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -477,8 +486,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_uaddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_cselect_b32 s2, 1, 0 +; GFX7-NEXT: s_add_u32 s1, s1, s3 +; GFX7-NEXT: s_cselect_b32 s3, 1, 0 +; GFX7-NEXT: s_add_u32 s1, s1, s2 +; GFX7-NEXT: s_cselect_b32 s2, 1, 0 +; GFX7-NEXT: s_or_b32 s2, s3, s2 +; GFX7-NEXT: s_and_b32 s2, s2, 1 ; GFX7-NEXT: s_add_u32 s0, s0, s2 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: ; return to shader part epilog @@ -486,8 +500,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX8-LABEL: s_uaddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s2 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_add_u32 s0, s0, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: ; return to shader part epilog @@ -495,8 +514,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s2 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_addc_u32 s1, 
s1, 0 ; GFX9-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 4c1935d06517e..eff845a146ace 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1084,7 +1084,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: v_mov_b32_e32 v1, s12 ; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX7-NEXT: s_mul_i32 s18, s1, s8 -; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 ; GFX7-NEXT: s_add_u32 s18, s18, s17 ; GFX7-NEXT: s_addc_u32 s17, s23, s22 ; GFX7-NEXT: v_mov_b32_e32 v4, s11 @@ -1095,33 +1095,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_mul_i32 s24, s1, s11 ; GFX7-NEXT: v_readfirstlane_b32 s28, v3 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_readfirstlane_b32 s27, v5 +; GFX7-NEXT: v_readfirstlane_b32 s25, v5 ; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9 -; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_cselect_b32 s27, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s27, s23 +; GFX7-NEXT: s_addc_u32 s23, s25, s23 ; GFX7-NEXT: v_readfirstlane_b32 s29, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8 -; GFX7-NEXT: s_mul_i32 s27, s2, s10 +; GFX7-NEXT: s_mul_i32 s25, s2, s10 ; GFX7-NEXT: s_cselect_b32 s22, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s27, s24 +; GFX7-NEXT: s_add_u32 s24, s25, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX7-NEXT: s_addc_u32 s27, s28, s23 +; GFX7-NEXT: s_addc_u32 s25, s28, s23 ; GFX7-NEXT: s_mul_i32 s28, s3, s9 ; GFX7-NEXT: s_cselect_b32 s23, 1, 0 ; GFX7-NEXT: s_add_u32 s28, s28, s24 ; GFX7-NEXT: v_readfirstlane_b32 s30, v6 ; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX7-NEXT: s_addc_u32 s27, s29, s27 +; GFX7-NEXT: s_addc_u32 s25, s29, s25 ; GFX7-NEXT: s_mul_i32 s29, s4, s8 ; GFX7-NEXT: s_cselect_b32 s24, 1, 0 ; GFX7-NEXT: s_add_u32 s28, s29, s28 ; 
GFX7-NEXT: v_readfirstlane_b32 s33, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX7-NEXT: s_addc_u32 s27, s30, s27 +; GFX7-NEXT: s_addc_u32 s29, s30, s25 ; GFX7-NEXT: s_mul_i32 s30, s16, s11 -; GFX7-NEXT: s_cselect_b32 s29, 1, 0 +; GFX7-NEXT: s_cselect_b32 s25, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s31, v6 ; GFX7-NEXT: s_add_u32 s19, s30, s19 ; GFX7-NEXT: s_addc_u32 s28, s31, s28 @@ -1139,88 +1139,93 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cselect_b32 s33, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: s_add_u32 s19, s34, s19 -; GFX7-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-NEXT: s_addc_u32 s28, s35, s28 -; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_cselect_b32 s34, 1, 0 -; GFX7-NEXT: s_cmp_lg_u32 s26, 0 -; GFX7-NEXT: s_addc_u32 s19, s25, s19 -; GFX7-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_add_u32 s19, s26, s19 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_add_u32 s19, s19, s27 +; GFX7-NEXT: s_cselect_b32 s27, 1, 0 +; GFX7-NEXT: s_or_b32 s26, s26, s27 +; GFX7-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 -; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 +; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_addc_u32 s20, s20, 0 -; GFX7-NEXT: v_readfirstlane_b32 s26, v0 +; GFX7-NEXT: s_add_u32 s20, s20, s28 +; GFX7-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-NEXT: s_cselect_b32 s21, 1, 0 +; GFX7-NEXT: s_and_b32 s26, s26, 1 +; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 +; GFX7-NEXT: s_add_u32 s20, s20, s26 +; GFX7-NEXT: v_readfirstlane_b32 s27, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX7-NEXT: s_cmp_lg_u32 s25, 0 -; GFX7-NEXT: s_addc_u32 s20, s20, s28 -; GFX7-NEXT: s_mul_i32 s25, s16, s14 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_or_b32 s21, s21, s26 +; GFX7-NEXT: s_mul_i32 s26, s16, s14 ; GFX7-NEXT: s_mul_i32 s28, s1, s13 -; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; 
GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s26, s28, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11 ; GFX7-NEXT: s_mul_i32 s28, s2, s12 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s26, s28, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10 ; GFX7-NEXT: s_mul_i32 s28, s3, s11 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s26, s28, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9 ; GFX7-NEXT: s_mul_i32 s28, s4, s10 -; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_add_u32 s26, s28, s26 ; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8 ; GFX7-NEXT: s_mul_i32 s28, s5, s9 -; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_add_u32 s26, s28, s26 ; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX7-NEXT: v_readfirstlane_b32 s36, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: s_mul_i32 s28, s6, s8 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s26, s28, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: s_mul_i32 s28, s16, s13 ; GFX7-NEXT: v_readfirstlane_b32 s35, v2 -; GFX7-NEXT: s_add_u32 s27, s28, s27 +; GFX7-NEXT: s_add_u32 s28, s28, s29 ; GFX7-NEXT: v_readfirstlane_b32 s37, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX7-NEXT: s_addc_u32 s25, s35, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: s_mul_i32 s35, s1, s12 -; GFX7-NEXT: 
s_cselect_b32 s28, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s35, s27 -; GFX7-NEXT: s_addc_u32 s25, s36, s25 +; GFX7-NEXT: s_cselect_b32 s29, 1, 0 +; GFX7-NEXT: s_add_u32 s28, s35, s28 +; GFX7-NEXT: s_addc_u32 s26, s36, s26 ; GFX7-NEXT: s_mul_i32 s36, s2, s11 ; GFX7-NEXT: s_cselect_b32 s35, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s36, s27 +; GFX7-NEXT: s_add_u32 s28, s36, s28 ; GFX7-NEXT: v_readfirstlane_b32 s38, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX7-NEXT: s_addc_u32 s25, s37, s25 +; GFX7-NEXT: s_addc_u32 s26, s37, s26 ; GFX7-NEXT: s_mul_i32 s37, s3, s10 ; GFX7-NEXT: s_cselect_b32 s36, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s37, s27 +; GFX7-NEXT: s_add_u32 s28, s37, s28 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX7-NEXT: s_addc_u32 s25, s38, s25 +; GFX7-NEXT: s_addc_u32 s26, s38, s26 ; GFX7-NEXT: s_mul_i32 s38, s4, s9 ; GFX7-NEXT: s_cselect_b32 s37, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s39, v1 -; GFX7-NEXT: s_add_u32 s27, s38, s27 -; GFX7-NEXT: s_addc_u32 s25, s39, s25 +; GFX7-NEXT: s_add_u32 s28, s38, s28 +; GFX7-NEXT: s_addc_u32 s26, s39, s26 ; GFX7-NEXT: s_mul_i32 s39, s5, s8 ; GFX7-NEXT: s_cselect_b32 s38, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s40, v0 -; GFX7-NEXT: s_add_u32 s27, s39, s27 -; GFX7-NEXT: s_addc_u32 s25, s40, s25 +; GFX7-NEXT: s_add_u32 s28, s39, s28 +; GFX7-NEXT: s_addc_u32 s26, s40, s26 ; GFX7-NEXT: s_cselect_b32 s39, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s31, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 @@ -1228,19 +1233,28 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s34, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 -; GFX7-NEXT: s_cmp_lg_u32 s21, 0 -; GFX7-NEXT: s_addc_u32 s21, s30, s27 -; GFX7-NEXT: s_cselect_b32 s27, 1, 0 +; GFX7-NEXT: s_add_u32 s28, s30, s28 +; GFX7-NEXT: s_cselect_b32 s30, 1, 0 +; GFX7-NEXT: s_and_b32 s21, s21, 1 +; GFX7-NEXT: s_add_u32 s21, s28, s21 +; GFX7-NEXT: s_cselect_b32 s28, 1, 0 +; GFX7-NEXT: s_or_b32 s28, s30, s28 ; GFX7-NEXT: s_cmp_lg_u32 s23, 0 ; 
GFX7-NEXT: s_addc_u32 s22, s22, 0 ; GFX7-NEXT: s_cmp_lg_u32 s24, 0 ; GFX7-NEXT: s_addc_u32 s22, s22, 0 -; GFX7-NEXT: s_cmp_lg_u32 s29, 0 +; GFX7-NEXT: s_cmp_lg_u32 s25, 0 ; GFX7-NEXT: s_addc_u32 s22, s22, 0 -; GFX7-NEXT: s_cmp_lg_u32 s27, 0 -; GFX7-NEXT: s_addc_u32 s22, s22, s25 +; GFX7-NEXT: s_add_u32 s22, s22, s26 +; GFX7-NEXT: s_cselect_b32 s23, 1, 0 +; GFX7-NEXT: s_and_b32 s24, s28, 1 +; GFX7-NEXT: s_add_u32 s22, s22, s24 +; GFX7-NEXT: s_cselect_b32 s24, 1, 0 +; GFX7-NEXT: s_or_b32 s23, s23, s24 ; GFX7-NEXT: s_mul_i32 s16, s16, s15 -; GFX7-NEXT: s_addc_u32 s15, s26, s16 +; GFX7-NEXT: s_and_b32 s15, s23, 1 +; GFX7-NEXT: s_cmp_lg_u32 s15, 0 +; GFX7-NEXT: s_addc_u32 s15, s27, s16 ; GFX7-NEXT: s_mul_i32 s1, s1, s14 ; GFX7-NEXT: s_cmp_lg_u32 s39, 0 ; GFX7-NEXT: s_addc_u32 s1, s15, s1 @@ -1257,7 +1271,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s35, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_mul_i32 s6, s6, s9 -; GFX7-NEXT: s_cmp_lg_u32 s28, 0 +; GFX7-NEXT: s_cmp_lg_u32 s29, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s6 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s0, s0, s8 @@ -1305,7 +1319,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: v_mov_b32_e32 v1, s12 ; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX8-NEXT: s_mul_i32 s18, s1, s8 -; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 ; GFX8-NEXT: s_add_u32 s18, s18, s17 ; GFX8-NEXT: s_addc_u32 s17, s23, s22 ; GFX8-NEXT: v_mov_b32_e32 v4, s11 @@ -1316,33 +1330,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_mul_i32 s24, s1, s11 ; GFX8-NEXT: v_readfirstlane_b32 s28, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_readfirstlane_b32 s27, v5 +; GFX8-NEXT: v_readfirstlane_b32 s25, v5 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 -; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s24, s22 
-; GFX8-NEXT: s_addc_u32 s23, s27, s23 +; GFX8-NEXT: s_addc_u32 s23, s25, s23 ; GFX8-NEXT: v_readfirstlane_b32 s29, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8 -; GFX8-NEXT: s_mul_i32 s27, s2, s10 +; GFX8-NEXT: s_mul_i32 s25, s2, s10 ; GFX8-NEXT: s_cselect_b32 s22, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s27, s24 +; GFX8-NEXT: s_add_u32 s24, s25, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX8-NEXT: s_addc_u32 s27, s28, s23 +; GFX8-NEXT: s_addc_u32 s25, s28, s23 ; GFX8-NEXT: s_mul_i32 s28, s3, s9 ; GFX8-NEXT: s_cselect_b32 s23, 1, 0 ; GFX8-NEXT: s_add_u32 s28, s28, s24 ; GFX8-NEXT: v_readfirstlane_b32 s30, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX8-NEXT: s_addc_u32 s27, s29, s27 +; GFX8-NEXT: s_addc_u32 s25, s29, s25 ; GFX8-NEXT: s_mul_i32 s29, s4, s8 ; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_add_u32 s28, s29, s28 ; GFX8-NEXT: v_readfirstlane_b32 s33, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX8-NEXT: s_addc_u32 s27, s30, s27 +; GFX8-NEXT: s_addc_u32 s29, s30, s25 ; GFX8-NEXT: s_mul_i32 s30, s16, s11 -; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s31, v6 ; GFX8-NEXT: s_add_u32 s19, s30, s19 ; GFX8-NEXT: s_addc_u32 s28, s31, s28 @@ -1360,88 +1374,93 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cselect_b32 s33, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: s_add_u32 s19, s34, s19 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: s_addc_u32 s28, s35, s28 -; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_cselect_b32 s34, 1, 0 -; GFX8-NEXT: s_cmp_lg_u32 s26, 0 -; GFX8-NEXT: s_addc_u32 s19, s25, s19 -; GFX8-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_add_u32 s19, s26, s19 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_add_u32 s19, s19, s27 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_or_b32 s26, s26, s27 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; 
GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 +; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_addc_u32 s20, s20, 0 -; GFX8-NEXT: v_readfirstlane_b32 s26, v0 +; GFX8-NEXT: s_add_u32 s20, s20, s28 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_and_b32 s26, s26, 1 +; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 +; GFX8-NEXT: s_add_u32 s20, s20, s26 +; GFX8-NEXT: v_readfirstlane_b32 s27, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX8-NEXT: s_cmp_lg_u32 s25, 0 -; GFX8-NEXT: s_addc_u32 s20, s20, s28 -; GFX8-NEXT: s_mul_i32 s25, s16, s14 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_or_b32 s21, s21, s26 +; GFX8-NEXT: s_mul_i32 s26, s16, s14 ; GFX8-NEXT: s_mul_i32 s28, s1, s13 -; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s26, s28, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11 ; GFX8-NEXT: s_mul_i32 s28, s2, s12 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s26, s28, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10 ; GFX8-NEXT: s_mul_i32 s28, s3, s11 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s26, s28, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9 ; GFX8-NEXT: s_mul_i32 s28, s4, s10 -; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_add_u32 s26, s28, s26 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8 ; GFX8-NEXT: s_mul_i32 s28, 
s5, s9 -; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_add_u32 s26, s28, s26 ; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX8-NEXT: v_readfirstlane_b32 s36, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: s_mul_i32 s28, s6, s8 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s26, s28, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: s_mul_i32 s28, s16, s13 ; GFX8-NEXT: v_readfirstlane_b32 s35, v2 -; GFX8-NEXT: s_add_u32 s27, s28, s27 +; GFX8-NEXT: s_add_u32 s28, s28, s29 ; GFX8-NEXT: v_readfirstlane_b32 s37, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX8-NEXT: s_addc_u32 s25, s35, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: s_mul_i32 s35, s1, s12 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s35, s27 -; GFX8-NEXT: s_addc_u32 s25, s36, s25 +; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_add_u32 s28, s35, s28 +; GFX8-NEXT: s_addc_u32 s26, s36, s26 ; GFX8-NEXT: s_mul_i32 s36, s2, s11 ; GFX8-NEXT: s_cselect_b32 s35, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s36, s27 +; GFX8-NEXT: s_add_u32 s28, s36, s28 ; GFX8-NEXT: v_readfirstlane_b32 s38, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX8-NEXT: s_addc_u32 s25, s37, s25 +; GFX8-NEXT: s_addc_u32 s26, s37, s26 ; GFX8-NEXT: s_mul_i32 s37, s3, s10 ; GFX8-NEXT: s_cselect_b32 s36, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s37, s27 +; GFX8-NEXT: s_add_u32 s28, s37, s28 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX8-NEXT: s_addc_u32 s25, s38, s25 +; GFX8-NEXT: s_addc_u32 s26, s38, s26 ; GFX8-NEXT: s_mul_i32 s38, s4, s9 ; GFX8-NEXT: s_cselect_b32 s37, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s39, v1 -; GFX8-NEXT: s_add_u32 s27, s38, s27 -; GFX8-NEXT: s_addc_u32 s25, s39, s25 +; GFX8-NEXT: s_add_u32 s28, s38, s28 +; GFX8-NEXT: s_addc_u32 s26, s39, s26 ; GFX8-NEXT: s_mul_i32 s39, s5, s8 ; GFX8-NEXT: s_cselect_b32 s38, 1, 0 ; 
GFX8-NEXT: v_readfirstlane_b32 s40, v0 -; GFX8-NEXT: s_add_u32 s27, s39, s27 -; GFX8-NEXT: s_addc_u32 s25, s40, s25 +; GFX8-NEXT: s_add_u32 s28, s39, s28 +; GFX8-NEXT: s_addc_u32 s26, s40, s26 ; GFX8-NEXT: s_cselect_b32 s39, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s31, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 @@ -1449,19 +1468,28 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s34, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 -; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_addc_u32 s21, s30, s27 -; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_add_u32 s28, s30, s28 +; GFX8-NEXT: s_cselect_b32 s30, 1, 0 +; GFX8-NEXT: s_and_b32 s21, s21, 1 +; GFX8-NEXT: s_add_u32 s21, s28, s21 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_or_b32 s28, s30, s28 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_addc_u32 s22, s22, 0 ; GFX8-NEXT: s_cmp_lg_u32 s24, 0 ; GFX8-NEXT: s_addc_u32 s22, s22, 0 -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cmp_lg_u32 s25, 0 ; GFX8-NEXT: s_addc_u32 s22, s22, 0 -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: s_addc_u32 s22, s22, s25 +; GFX8-NEXT: s_add_u32 s22, s22, s26 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 +; GFX8-NEXT: s_and_b32 s24, s28, 1 +; GFX8-NEXT: s_add_u32 s22, s22, s24 +; GFX8-NEXT: s_cselect_b32 s24, 1, 0 +; GFX8-NEXT: s_or_b32 s23, s23, s24 ; GFX8-NEXT: s_mul_i32 s16, s16, s15 -; GFX8-NEXT: s_addc_u32 s15, s26, s16 +; GFX8-NEXT: s_and_b32 s15, s23, 1 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_addc_u32 s15, s27, s16 ; GFX8-NEXT: s_mul_i32 s1, s1, s14 ; GFX8-NEXT: s_cmp_lg_u32 s39, 0 ; GFX8-NEXT: s_addc_u32 s1, s15, s1 @@ -1478,7 +1506,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s35, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_mul_i32 s6, s6, s9 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s6 ; GFX8-NEXT: s_mul_i32 s7, 
s7, s8 ; GFX8-NEXT: s_mul_i32 s0, s0, s8 @@ -1510,15 +1538,15 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9 ; GFX9-NEXT: s_add_u32 s17, s22, s17 -; GFX9-NEXT: s_addc_u32 s18, s23, s18 -; GFX9-NEXT: s_mul_i32 s23, s1, s8 -; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_addc_u32 s22, s23, s18 +; GFX9-NEXT: s_mul_i32 s18, s1, s8 +; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8 -; GFX9-NEXT: s_add_u32 s17, s23, s17 -; GFX9-NEXT: s_addc_u32 s18, s24, s18 +; GFX9-NEXT: s_add_u32 s18, s18, s17 +; GFX9-NEXT: s_addc_u32 s17, s24, s22 ; GFX9-NEXT: s_mul_i32 s24, s16, s12 ; GFX9-NEXT: s_mul_i32 s26, s1, s11 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12 ; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11 ; GFX9-NEXT: s_add_u32 s24, s26, s24 @@ -1559,16 +1587,21 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_add_u32 s19, s34, s19 ; GFX9-NEXT: s_addc_u32 s24, s35, s24 ; GFX9-NEXT: s_cselect_b32 s34, 1, 0 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_addc_u32 s19, s22, s19 +; GFX9-NEXT: s_add_u32 s19, s23, s19 +; GFX9-NEXT: s_cselect_b32 s23, 1, 0 +; GFX9-NEXT: s_add_u32 s19, s19, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_or_b32 s22, s23, s22 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 ; GFX9-NEXT: s_addc_u32 s20, s20, 0 -; GFX9-NEXT: s_cmp_lg_u32 s22, 0 -; GFX9-NEXT: s_addc_u32 s20, s20, s24 +; GFX9-NEXT: s_add_u32 s20, s20, s24 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 +; GFX9-NEXT: s_and_b32 s22, s22, 1 +; GFX9-NEXT: s_add_u32 s20, s20, s22 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_or_b32 s21, s21, s22 ; GFX9-NEXT: s_mul_i32 s22, s16, s14 ; GFX9-NEXT: s_mul_i32 s24, s1, s13 -; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s23, s16, s14 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13 ; GFX9-NEXT: s_add_u32 s22, s24, 
s22 @@ -1629,18 +1662,27 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_addc_u32 s30, s30, 0 ; GFX9-NEXT: s_cmp_lg_u32 s34, 0 ; GFX9-NEXT: s_addc_u32 s30, s30, 0 -; GFX9-NEXT: s_cmp_lg_u32 s21, 0 -; GFX9-NEXT: s_addc_u32 s21, s30, s24 +; GFX9-NEXT: s_add_u32 s24, s30, s24 +; GFX9-NEXT: s_cselect_b32 s30, 1, 0 +; GFX9-NEXT: s_and_b32 s21, s21, 1 +; GFX9-NEXT: s_add_u32 s21, s24, s21 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 +; GFX9-NEXT: s_or_b32 s24, s30, s24 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s29, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 -; GFX9-NEXT: s_cmp_lg_u32 s24, 0 -; GFX9-NEXT: s_addc_u32 s22, s26, s22 +; GFX9-NEXT: s_add_u32 s22, s26, s22 +; GFX9-NEXT: s_cselect_b32 s26, 1, 0 +; GFX9-NEXT: s_and_b32 s24, s24, 1 +; GFX9-NEXT: s_add_u32 s22, s22, s24 +; GFX9-NEXT: s_cselect_b32 s24, 1, 0 +; GFX9-NEXT: s_or_b32 s24, s26, s24 ; GFX9-NEXT: s_mul_i32 s16, s16, s15 +; GFX9-NEXT: s_and_b32 s15, s24, 1 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_addc_u32 s15, s23, s16 ; GFX9-NEXT: s_mul_i32 s1, s1, s14 ; GFX9-NEXT: s_cmp_lg_u32 s39, 0 @@ -1663,192 +1705,399 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_i32 s0, s0, s8 ; GFX9-NEXT: s_add_u32 s7, s7, s1 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s18 +; GFX9-NEXT: s_mov_b32 s2, s17 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s4, s20 ; GFX9-NEXT: s_mov_b32 s5, s21 ; GFX9-NEXT: s_mov_b32 s6, s22 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_mul_i256: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mul_i32 s17, s0, s10 -; GFX10PLUS-NEXT: s_mul_i32 s19, s1, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s18, s0, s10 -; GFX10PLUS-NEXT: s_mul_hi_u32 s20, s1, s9 -; GFX10PLUS-NEXT: s_add_u32 s17, s19, s17 -; 
GFX10PLUS-NEXT: s_addc_u32 s18, s20, s18 -; GFX10PLUS-NEXT: s_mul_i32 s20, s2, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s21, s2, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s17, s20, s17 -; GFX10PLUS-NEXT: s_mul_hi_u32 s16, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 -; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s9 -; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s16, s21, s16 -; GFX10PLUS-NEXT: s_addc_u32 s17, s22, s17 -; GFX10PLUS-NEXT: s_mul_i32 s22, s1, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s23, s1, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s16, s22, s16 -; GFX10PLUS-NEXT: s_addc_u32 s17, s23, s17 -; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s12 -; GFX10PLUS-NEXT: s_mul_i32 s25, s1, s11 -; GFX10PLUS-NEXT: s_mul_hi_u32 s24, s0, s12 -; GFX10PLUS-NEXT: s_mul_hi_u32 s26, s1, s11 -; GFX10PLUS-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s25, s23 -; GFX10PLUS-NEXT: s_addc_u32 s24, s26, s24 -; GFX10PLUS-NEXT: s_mul_i32 s26, s2, s10 -; GFX10PLUS-NEXT: s_mul_hi_u32 s27, s2, s10 -; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s26, s23 -; GFX10PLUS-NEXT: s_addc_u32 s24, s27, s24 -; GFX10PLUS-NEXT: s_mul_i32 s27, s3, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s28, s3, s9 -; GFX10PLUS-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s27, s23 -; GFX10PLUS-NEXT: s_addc_u32 s24, s28, s24 -; GFX10PLUS-NEXT: s_mul_i32 s28, s4, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s29, s4, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s28, s23 -; GFX10PLUS-NEXT: s_addc_u32 s24, s29, s24 -; GFX10PLUS-NEXT: s_mul_i32 s29, s0, s11 -; GFX10PLUS-NEXT: s_mul_hi_u32 s30, s0, s11 -; GFX10PLUS-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s18, s29, s18 -; GFX10PLUS-NEXT: s_addc_u32 s23, s30, s23 -; GFX10PLUS-NEXT: s_mul_i32 s30, s1, s10 -; GFX10PLUS-NEXT: s_mul_hi_u32 s31, s1, s10 -; 
GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s18, s30, s18 -; GFX10PLUS-NEXT: s_addc_u32 s23, s31, s23 -; GFX10PLUS-NEXT: s_mul_i32 s31, s2, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s33, s2, s9 -; GFX10PLUS-NEXT: s_cselect_b32 s30, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s18, s31, s18 -; GFX10PLUS-NEXT: s_addc_u32 s23, s33, s23 -; GFX10PLUS-NEXT: s_mul_i32 s33, s3, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s31, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18 -; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23 -; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0 -; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14 -; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 -; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13 -; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0 -; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14 -; GFX10PLUS-NEXT: s_addc_u32 s19, s19, s23 -; GFX10PLUS-NEXT: s_mul_i32 s23, s1, s13 -; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s2, s12 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s2, s12 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s3, s11 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s11 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s4, s10 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s4, s10 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s5, s9 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s5, s9 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s6, s8 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s6, s8 -; GFX10PLUS-NEXT: 
s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s13 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s0, s13 -; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24 -; GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21 -; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12 -; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12 -; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s34, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21 -; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11 -; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11 -; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s35, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21 -; GFX10PLUS-NEXT: s_mul_i32 s36, s3, s10 -; GFX10PLUS-NEXT: s_mul_hi_u32 s37, s3, s10 -; GFX10PLUS-NEXT: s_cselect_b32 s35, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s36, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s37, s21 -; GFX10PLUS-NEXT: s_mul_i32 s37, s4, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s38, s4, s9 -; GFX10PLUS-NEXT: s_cselect_b32 s36, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s37, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s38, s21 -; GFX10PLUS-NEXT: s_mul_i32 s38, s5, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s39, s5, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s37, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21 -; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0 -; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14 -; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0 -; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13 -; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0 -; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12 -; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11 -; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23 -; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15 -; GFX10PLUS-NEXT: 
s_addc_u32 s25, s25, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10 -; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9 -; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8 -; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21 -; GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0 -; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s2 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s36, 0 -; GFX10PLUS-NEXT: s_mov_b32 s2, s17 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s35, 0 -; GFX10PLUS-NEXT: s_mov_b32 s3, s18 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s4 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s34, 0 -; GFX10PLUS-NEXT: s_mov_b32 s4, s19 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10PLUS-NEXT: s_mov_b32 s5, s20 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s6 -; GFX10PLUS-NEXT: s_mov_b32 s6, s15 -; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7 -; GFX10PLUS-NEXT: s_mov_b32 s1, s16 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_mul_i256: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s17, s0, s10 +; GFX10-NEXT: s_mul_i32 s19, s1, s9 +; GFX10-NEXT: s_mul_hi_u32 s18, s0, s10 +; GFX10-NEXT: s_mul_hi_u32 s20, s1, s9 +; GFX10-NEXT: s_add_u32 s17, s19, s17 +; GFX10-NEXT: s_addc_u32 s18, s20, s18 +; GFX10-NEXT: s_mul_i32 s20, s2, s8 +; GFX10-NEXT: s_mul_hi_u32 s21, s2, s8 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_add_u32 s17, s20, s17 +; GFX10-NEXT: s_mul_hi_u32 s16, s0, s8 +; GFX10-NEXT: s_addc_u32 s18, s21, s18 +; GFX10-NEXT: s_mul_i32 s21, s0, s9 +; GFX10-NEXT: s_mul_hi_u32 s22, s0, s9 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_add_u32 s16, s21, s16 +; GFX10-NEXT: s_addc_u32 s21, s22, s17 +; GFX10-NEXT: 
s_mul_i32 s17, s1, s8 +; GFX10-NEXT: s_mul_hi_u32 s23, s1, s8 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_add_u32 s17, s17, s16 +; GFX10-NEXT: s_addc_u32 s16, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s0, s12 +; GFX10-NEXT: s_mul_i32 s25, s1, s11 +; GFX10-NEXT: s_mul_hi_u32 s24, s0, s12 +; GFX10-NEXT: s_mul_hi_u32 s26, s1, s11 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s25, s23 +; GFX10-NEXT: s_addc_u32 s24, s26, s24 +; GFX10-NEXT: s_mul_i32 s26, s2, s10 +; GFX10-NEXT: s_mul_hi_u32 s27, s2, s10 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s26, s23 +; GFX10-NEXT: s_addc_u32 s24, s27, s24 +; GFX10-NEXT: s_mul_i32 s27, s3, s9 +; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX10-NEXT: s_cselect_b32 s26, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s27, s23 +; GFX10-NEXT: s_addc_u32 s24, s28, s24 +; GFX10-NEXT: s_mul_i32 s28, s4, s8 +; GFX10-NEXT: s_mul_hi_u32 s29, s4, s8 +; GFX10-NEXT: s_cselect_b32 s27, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s28, s23 +; GFX10-NEXT: s_addc_u32 s24, s29, s24 +; GFX10-NEXT: s_mul_i32 s29, s0, s11 +; GFX10-NEXT: s_mul_hi_u32 s30, s0, s11 +; GFX10-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s29, s18 +; GFX10-NEXT: s_addc_u32 s23, s30, s23 +; GFX10-NEXT: s_mul_i32 s30, s1, s10 +; GFX10-NEXT: s_mul_hi_u32 s31, s1, s10 +; GFX10-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s30, s18 +; GFX10-NEXT: s_addc_u32 s23, s31, s23 +; GFX10-NEXT: s_mul_i32 s31, s2, s9 +; GFX10-NEXT: s_mul_hi_u32 s33, s2, s9 +; GFX10-NEXT: s_cselect_b32 s30, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s31, s18 +; GFX10-NEXT: s_addc_u32 s23, s33, s23 +; GFX10-NEXT: s_mul_i32 s33, s3, s8 +; GFX10-NEXT: s_mul_hi_u32 s34, s3, s8 +; GFX10-NEXT: s_cselect_b32 s31, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s33, s18 +; GFX10-NEXT: s_addc_u32 s23, s34, s23 +; GFX10-NEXT: s_cselect_b32 s33, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s22, s18 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s18, s21 +; GFX10-NEXT: 
s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX10-NEXT: s_or_b32 s21, s22, s21 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_mul_hi_u32 s22, s0, s14 +; GFX10-NEXT: s_addc_u32 s19, s19, 0 +; GFX10-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX10-NEXT: s_add_u32 s19, s19, s23 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_mul_i32 s23, s1, s13 +; GFX10-NEXT: s_add_u32 s19, s19, s21 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX10-NEXT: s_or_b32 s20, s20, s21 +; GFX10-NEXT: s_mul_i32 s21, s0, s14 +; GFX10-NEXT: s_mul_hi_u32 s37, s3, s10 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s2, s12 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s2, s12 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s3, s11 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s3, s11 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s4, s10 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s4, s10 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s5, s9 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s5, s9 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s6, s8 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s6, s8 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s0, s13 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s0, s13 +; GFX10-NEXT: s_add_u32 s23, s23, s24 +; GFX10-NEXT: s_addc_u32 s21, s34, s21 +; GFX10-NEXT: s_mul_i32 s34, s1, s12 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s34, s23 +; GFX10-NEXT: s_addc_u32 s21, s35, s21 +; GFX10-NEXT: s_mul_i32 s35, s2, s11 +; GFX10-NEXT: s_cselect_b32 s34, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s35, s23 +; GFX10-NEXT: s_addc_u32 s21, s36, s21 +; GFX10-NEXT: 
s_mul_i32 s36, s3, s10 +; GFX10-NEXT: s_cselect_b32 s35, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s36, s23 +; GFX10-NEXT: s_addc_u32 s21, s37, s21 +; GFX10-NEXT: s_mul_i32 s37, s4, s9 +; GFX10-NEXT: s_mul_hi_u32 s38, s4, s9 +; GFX10-NEXT: s_cselect_b32 s36, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s37, s23 +; GFX10-NEXT: s_addc_u32 s21, s38, s21 +; GFX10-NEXT: s_mul_i32 s38, s5, s8 +; GFX10-NEXT: s_mul_hi_u32 s39, s5, s8 +; GFX10-NEXT: s_cselect_b32 s37, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s38, s23 +; GFX10-NEXT: s_addc_u32 s21, s39, s21 +; GFX10-NEXT: s_cselect_b32 s38, 1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10-NEXT: s_mul_i32 s15, s0, s15 +; GFX10-NEXT: s_addc_u32 s29, s29, 0 +; GFX10-NEXT: s_cmp_lg_u32 s31, 0 +; GFX10-NEXT: s_mul_i32 s1, s1, s14 +; GFX10-NEXT: s_addc_u32 s29, s29, 0 +; GFX10-NEXT: s_cmp_lg_u32 s33, 0 +; GFX10-NEXT: s_mul_i32 s2, s2, s13 +; GFX10-NEXT: s_addc_u32 s29, s29, 0 +; GFX10-NEXT: s_mul_i32 s3, s3, s12 +; GFX10-NEXT: s_add_u32 s23, s29, s23 +; GFX10-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10-NEXT: s_and_b32 s20, s20, 1 +; GFX10-NEXT: s_mul_i32 s4, s4, s11 +; GFX10-NEXT: s_add_u32 s20, s23, s20 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s23, s29, s23 +; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_mul_i32 s6, s6, s9 +; GFX10-NEXT: s_addc_u32 s25, s25, 0 +; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_mul_i32 s7, s7, s8 +; GFX10-NEXT: s_addc_u32 s25, s25, 0 +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_mul_i32 s0, s0, s8 +; GFX10-NEXT: s_addc_u32 s25, s25, 0 +; GFX10-NEXT: s_add_u32 s21, s25, s21 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_add_u32 s21, s21, s23 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_or_b32 s23, s25, s23 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_addc_u32 s15, s22, s15 +; GFX10-NEXT: s_cmp_lg_u32 s38, 0 +; GFX10-NEXT: s_addc_u32 s1, s15, s1 +; 
GFX10-NEXT: s_cmp_lg_u32 s37, 0 +; GFX10-NEXT: s_addc_u32 s1, s1, s2 +; GFX10-NEXT: s_cmp_lg_u32 s36, 0 +; GFX10-NEXT: s_mov_b32 s2, s16 +; GFX10-NEXT: s_addc_u32 s1, s1, s3 +; GFX10-NEXT: s_cmp_lg_u32 s35, 0 +; GFX10-NEXT: s_mov_b32 s3, s18 +; GFX10-NEXT: s_addc_u32 s1, s1, s4 +; GFX10-NEXT: s_cmp_lg_u32 s34, 0 +; GFX10-NEXT: s_mov_b32 s4, s19 +; GFX10-NEXT: s_addc_u32 s1, s1, s5 +; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: s_mov_b32 s5, s20 +; GFX10-NEXT: s_addc_u32 s1, s1, s6 +; GFX10-NEXT: s_mov_b32 s6, s21 +; GFX10-NEXT: s_add_i32 s7, s1, s7 +; GFX10-NEXT: s_mov_b32 s1, s17 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_mul_i256: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mul_i32 s17, s0, s10 +; GFX11-NEXT: s_mul_i32 s19, s1, s9 +; GFX11-NEXT: s_mul_hi_u32 s18, s0, s10 +; GFX11-NEXT: s_mul_hi_u32 s20, s1, s9 +; GFX11-NEXT: s_add_u32 s17, s19, s17 +; GFX11-NEXT: s_addc_u32 s18, s20, s18 +; GFX11-NEXT: s_mul_i32 s20, s2, s8 +; GFX11-NEXT: s_mul_hi_u32 s21, s2, s8 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_add_u32 s17, s20, s17 +; GFX11-NEXT: s_mul_hi_u32 s16, s0, s8 +; GFX11-NEXT: s_addc_u32 s18, s21, s18 +; GFX11-NEXT: s_mul_i32 s21, s0, s9 +; GFX11-NEXT: s_mul_hi_u32 s22, s0, s9 +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: s_add_u32 s16, s21, s16 +; GFX11-NEXT: s_addc_u32 s17, s22, s17 +; GFX11-NEXT: s_mul_i32 s22, s1, s8 +; GFX11-NEXT: s_mul_hi_u32 s23, s1, s8 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_add_u32 s16, s22, s16 +; GFX11-NEXT: s_addc_u32 s17, s23, s17 +; GFX11-NEXT: s_mul_i32 s23, s0, s12 +; GFX11-NEXT: s_mul_i32 s25, s1, s11 +; GFX11-NEXT: s_mul_hi_u32 s24, s0, s12 +; GFX11-NEXT: s_mul_hi_u32 s26, s1, s11 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s25, s23 +; GFX11-NEXT: s_addc_u32 s24, s26, s24 +; GFX11-NEXT: s_mul_i32 s26, s2, s10 +; GFX11-NEXT: s_mul_hi_u32 s27, s2, s10 +; GFX11-NEXT: s_cselect_b32 s25, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s26, s23 +; GFX11-NEXT: 
s_addc_u32 s24, s27, s24 +; GFX11-NEXT: s_mul_i32 s27, s3, s9 +; GFX11-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX11-NEXT: s_cselect_b32 s26, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s27, s23 +; GFX11-NEXT: s_addc_u32 s24, s28, s24 +; GFX11-NEXT: s_mul_i32 s28, s4, s8 +; GFX11-NEXT: s_mul_hi_u32 s29, s4, s8 +; GFX11-NEXT: s_cselect_b32 s27, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s28, s23 +; GFX11-NEXT: s_addc_u32 s24, s29, s24 +; GFX11-NEXT: s_mul_i32 s29, s0, s11 +; GFX11-NEXT: s_mul_hi_u32 s30, s0, s11 +; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s29, s18 +; GFX11-NEXT: s_addc_u32 s23, s30, s23 +; GFX11-NEXT: s_mul_i32 s30, s1, s10 +; GFX11-NEXT: s_mul_hi_u32 s31, s1, s10 +; GFX11-NEXT: s_cselect_b32 s29, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s30, s18 +; GFX11-NEXT: s_addc_u32 s23, s31, s23 +; GFX11-NEXT: s_mul_i32 s31, s2, s9 +; GFX11-NEXT: s_mul_hi_u32 s33, s2, s9 +; GFX11-NEXT: s_cselect_b32 s30, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s31, s18 +; GFX11-NEXT: s_addc_u32 s23, s33, s23 +; GFX11-NEXT: s_mul_i32 s33, s3, s8 +; GFX11-NEXT: s_mul_hi_u32 s34, s3, s8 +; GFX11-NEXT: s_cselect_b32 s31, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s33, s18 +; GFX11-NEXT: s_addc_u32 s23, s34, s23 +; GFX11-NEXT: s_cselect_b32 s33, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s21, s18 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s18, s22 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX11-NEXT: s_or_b32 s21, s21, s22 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mul_hi_u32 s22, s0, s14 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX11-NEXT: s_add_u32 s19, s19, s23 +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: s_and_b32 s21, s21, 1 +; GFX11-NEXT: s_mul_i32 s23, s1, s13 +; GFX11-NEXT: s_add_u32 s19, s19, s21 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX11-NEXT: s_or_b32 s20, s20, s21 +; GFX11-NEXT: s_mul_i32 s21, s0, s14 +; GFX11-NEXT: 
s_mul_hi_u32 s37, s3, s10 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s2, s12 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s2, s12 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s3, s11 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s3, s11 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s4, s10 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s4, s10 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s5, s9 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s5, s9 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s6, s8 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s6, s8 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s0, s13 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s0, s13 +; GFX11-NEXT: s_add_u32 s23, s23, s24 +; GFX11-NEXT: s_addc_u32 s21, s34, s21 +; GFX11-NEXT: s_mul_i32 s34, s1, s12 +; GFX11-NEXT: s_cselect_b32 s24, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s34, s23 +; GFX11-NEXT: s_addc_u32 s21, s35, s21 +; GFX11-NEXT: s_mul_i32 s35, s2, s11 +; GFX11-NEXT: s_cselect_b32 s34, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s35, s23 +; GFX11-NEXT: s_addc_u32 s21, s36, s21 +; GFX11-NEXT: s_mul_i32 s36, s3, s10 +; GFX11-NEXT: s_cselect_b32 s35, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s36, s23 +; GFX11-NEXT: s_addc_u32 s21, s37, s21 +; GFX11-NEXT: s_mul_i32 s37, s4, s9 +; GFX11-NEXT: s_mul_hi_u32 s38, s4, s9 +; GFX11-NEXT: s_cselect_b32 s36, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s37, s23 +; GFX11-NEXT: s_addc_u32 s21, s38, s21 +; GFX11-NEXT: s_mul_i32 s38, s5, s8 +; GFX11-NEXT: s_mul_hi_u32 s39, s5, s8 +; GFX11-NEXT: s_cselect_b32 s37, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s38, s23 +; GFX11-NEXT: s_addc_u32 s21, s39, s21 +; GFX11-NEXT: s_cselect_b32 s38, 1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s30, 0 +; GFX11-NEXT: 
s_mul_i32 s15, s0, s15 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_cmp_lg_u32 s31, 0 +; GFX11-NEXT: s_mul_i32 s1, s1, s14 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_cmp_lg_u32 s33, 0 +; GFX11-NEXT: s_mul_i32 s2, s2, s13 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_mul_i32 s3, s3, s12 +; GFX11-NEXT: s_add_u32 s23, s29, s23 +; GFX11-NEXT: s_cselect_b32 s29, 1, 0 +; GFX11-NEXT: s_and_b32 s20, s20, 1 +; GFX11-NEXT: s_mul_i32 s4, s4, s11 +; GFX11-NEXT: s_add_u32 s20, s23, s20 +; GFX11-NEXT: s_cselect_b32 s23, 1, 0 +; GFX11-NEXT: s_mul_i32 s5, s5, s10 +; GFX11-NEXT: s_or_b32 s23, s29, s23 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mul_i32 s6, s6, s9 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_mul_i32 s7, s7, s8 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mul_i32 s0, s0, s8 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s21, s25, s21 +; GFX11-NEXT: s_cselect_b32 s25, 1, 0 +; GFX11-NEXT: s_and_b32 s23, s23, 1 +; GFX11-NEXT: s_add_u32 s21, s21, s23 +; GFX11-NEXT: s_cselect_b32 s23, 1, 0 +; GFX11-NEXT: s_or_b32 s23, s25, s23 +; GFX11-NEXT: s_and_b32 s23, s23, 1 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_addc_u32 s15, s22, s15 +; GFX11-NEXT: s_cmp_lg_u32 s38, 0 +; GFX11-NEXT: s_addc_u32 s1, s15, s1 +; GFX11-NEXT: s_cmp_lg_u32 s37, 0 +; GFX11-NEXT: s_addc_u32 s1, s1, s2 +; GFX11-NEXT: s_cmp_lg_u32 s36, 0 +; GFX11-NEXT: s_mov_b32 s2, s17 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: s_cmp_lg_u32 s35, 0 +; GFX11-NEXT: s_mov_b32 s3, s18 +; GFX11-NEXT: s_addc_u32 s1, s1, s4 +; GFX11-NEXT: s_cmp_lg_u32 s34, 0 +; GFX11-NEXT: s_mov_b32 s4, s19 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s5, s20 +; GFX11-NEXT: s_addc_u32 s1, s1, s6 +; GFX11-NEXT: s_mov_b32 s6, s21 +; GFX11-NEXT: s_add_i32 s7, s1, s7 +; GFX11-NEXT: s_mov_b32 s1, s16 +; GFX11-NEXT: ; return to shader part 
epilog ; ; GFX12-LABEL: s_mul_i256: ; GFX12: ; %bb.0: @@ -1917,18 +2166,26 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_u32 s18, s33, s18 ; GFX12-NEXT: s_add_co_ci_u32 s23, s34, s23 ; GFX12-NEXT: s_cselect_b32 s33, 1, 0 -; GFX12-NEXT: s_cmp_lg_u32 s22, 0 -; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14 -; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18 +; GFX12-NEXT: s_add_co_u32 s18, s21, s18 ; GFX12-NEXT: s_cselect_b32 s21, 1, 0 -; GFX12-NEXT: s_cmp_lg_u32 s20, 0 +; GFX12-NEXT: s_add_co_u32 s18, s18, s22 +; GFX12-NEXT: s_cselect_b32 s22, 1, 0 ; GFX12-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX12-NEXT: s_or_b32 s21, s21, s22 +; GFX12-NEXT: s_cmp_lg_u32 s20, 0 +; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14 ; GFX12-NEXT: s_add_co_ci_u32 s19, s19, 0 -; GFX12-NEXT: s_cmp_lg_u32 s21, 0 -; GFX12-NEXT: s_mul_i32 s21, s0, s14 -; GFX12-NEXT: s_add_co_ci_u32 s19, s19, s23 -; GFX12-NEXT: s_mul_i32 s23, s1, s13 +; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX12-NEXT: s_add_co_u32 s19, s19, s23 ; GFX12-NEXT: s_cselect_b32 s20, 1, 0 +; GFX12-NEXT: s_and_b32 s21, s21, 1 +; GFX12-NEXT: s_mul_i32 s23, s1, s13 +; GFX12-NEXT: s_add_co_u32 s19, s19, s21 +; GFX12-NEXT: s_cselect_b32 s21, 1, 0 +; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX12-NEXT: s_or_b32 s20, s20, s21 +; GFX12-NEXT: s_mul_i32 s21, s0, s14 +; GFX12-NEXT: s_mul_hi_u32 s37, s3, s10 ; GFX12-NEXT: s_add_co_u32 s21, s23, s21 ; GFX12-NEXT: s_mul_i32 s23, s2, s12 ; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 @@ -1956,17 +2213,14 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_u32 s23, s23, s24 ; GFX12-NEXT: s_add_co_ci_u32 s21, s34, s21 ; GFX12-NEXT: s_mul_i32 s34, s1, s12 -; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX12-NEXT: s_cselect_b32 s24, 1, 0 ; GFX12-NEXT: s_add_co_u32 s23, s34, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s35, s21 ; GFX12-NEXT: s_mul_i32 s35, s2, s11 -; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX12-NEXT: s_cselect_b32 s34, 
1, 0 ; GFX12-NEXT: s_add_co_u32 s23, s35, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s36, s21 ; GFX12-NEXT: s_mul_i32 s36, s3, s10 -; GFX12-NEXT: s_mul_hi_u32 s37, s3, s10 ; GFX12-NEXT: s_cselect_b32 s35, 1, 0 ; GFX12-NEXT: s_add_co_u32 s23, s36, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s37, s21 @@ -1982,34 +2236,46 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_ci_u32 s21, s39, s21 ; GFX12-NEXT: s_cselect_b32 s38, 1, 0 ; GFX12-NEXT: s_cmp_lg_u32 s30, 0 -; GFX12-NEXT: s_mul_i32 s1, s1, s14 +; GFX12-NEXT: s_mul_i32 s15, s0, s15 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX12-NEXT: s_cmp_lg_u32 s31, 0 -; GFX12-NEXT: s_mul_i32 s2, s2, s13 +; GFX12-NEXT: s_mul_i32 s1, s1, s14 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX12-NEXT: s_cmp_lg_u32 s33, 0 -; GFX12-NEXT: s_mul_i32 s3, s3, s12 +; GFX12-NEXT: s_mul_i32 s2, s2, s13 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 -; GFX12-NEXT: s_cmp_lg_u32 s20, 0 +; GFX12-NEXT: s_mul_i32 s3, s3, s12 +; GFX12-NEXT: s_add_co_u32 s23, s29, s23 +; GFX12-NEXT: s_cselect_b32 s29, 1, 0 +; GFX12-NEXT: s_and_b32 s20, s20, 1 ; GFX12-NEXT: s_mul_i32 s4, s4, s11 -; GFX12-NEXT: s_add_co_ci_u32 s20, s29, s23 +; GFX12-NEXT: s_add_co_u32 s20, s23, s20 ; GFX12-NEXT: s_cselect_b32 s23, 1, 0 +; GFX12-NEXT: s_mul_i32 s5, s5, s10 +; GFX12-NEXT: s_or_b32 s23, s29, s23 ; GFX12-NEXT: s_cmp_lg_u32 s26, 0 -; GFX12-NEXT: s_mul_i32 s26, s0, s15 +; GFX12-NEXT: s_mul_i32 s6, s6, s9 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 ; GFX12-NEXT: s_cmp_lg_u32 s27, 0 -; GFX12-NEXT: s_mul_i32 s5, s5, s10 +; GFX12-NEXT: s_mul_i32 s7, s7, s8 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 ; GFX12-NEXT: s_cmp_lg_u32 s28, 0 -; GFX12-NEXT: s_mul_i32 s6, s6, s9 +; GFX12-NEXT: s_mul_i32 s0, s0, s8 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_co_u32 s21, s25, s21 +; GFX12-NEXT: s_cselect_b32 s25, 1, 0 +; GFX12-NEXT: s_and_b32 s23, 
s23, 1 +; GFX12-NEXT: s_add_co_u32 s21, s21, s23 +; GFX12-NEXT: s_cselect_b32 s23, 1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s23, s25, s23 +; GFX12-NEXT: s_and_b32 s23, s23, 1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_lg_u32 s23, 0 -; GFX12-NEXT: s_mul_i32 s7, s7, s8 -; GFX12-NEXT: s_add_co_ci_u32 s15, s25, s21 -; GFX12-NEXT: s_add_co_ci_u32 s21, s22, s26 +; GFX12-NEXT: s_add_co_ci_u32 s15, s22, s15 ; GFX12-NEXT: s_cmp_lg_u32 s38, 0 -; GFX12-NEXT: s_mul_i32 s0, s0, s8 -; GFX12-NEXT: s_add_co_ci_u32 s1, s21, s1 +; GFX12-NEXT: s_add_co_ci_u32 s1, s15, s1 ; GFX12-NEXT: s_cmp_lg_u32 s37, 0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s2 ; GFX12-NEXT: s_cmp_lg_u32 s36, 0 @@ -2024,7 +2290,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_cmp_lg_u32 s24, 0 ; GFX12-NEXT: s_mov_b32 s5, s20 ; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s6 -; GFX12-NEXT: s_mov_b32 s6, s15 +; GFX12-NEXT: s_mov_b32 s6, s21 ; GFX12-NEXT: s_add_co_i32 s7, s1, s7 ; GFX12-NEXT: s_mov_b32 s1, s16 ; GFX12-NEXT: ; return to shader part epilog @@ -2037,208 +2303,244 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v16, v0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX7-NEXT: v_mov_b32_e32 v17, v1 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc -; GFX7-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mov_b32_e32 v18, v23 -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-NEXT: v_mov_b32_e32 v1, v23 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX7-NEXT: v_mov_b32_e32 v2, v22 -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, 
v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v22, s[6:7], 0, v22, s[6:7] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v4, v8, v[16:17] +; GFX7-NEXT: v_mov_b32_e32 v16, v19 +; GFX7-NEXT: v_mov_b32_e32 v17, v20 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v25, 0, 1, s[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[10:11], v1, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0 +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20] +; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v25, s[10:11], 0, v25, s[10:11] +; GFX7-NEXT: v_add_i32_e64 v23, s[14:15], v23, v19 +; GFX7-NEXT: v_add_i32_e64 v24, s[16:17], v22, v20 +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0 +; GFX7-NEXT: v_addc_u32_e64 v25, s[12:13], 0, v25, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20] +; GFX7-NEXT: v_addc_u32_e64 v25, s[6:7], 0, v25, s[6:7] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20] +; GFX7-NEXT: v_mov_b32_e32 v22, v19 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22] +; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21] +; GFX7-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GFX7-NEXT: v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v19, s[4:5], 0, v19, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22] +; 
GFX7-NEXT: v_addc_u32_e64 v19, s[8:9], 0, v19, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22] ; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX7-NEXT: v_mul_lo_u32 v11, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22] +; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22] +; GFX7-NEXT: v_add_i32_e64 v21, s[6:7], v25, v21 +; GFX7-NEXT: v_add_i32_e64 v19, s[20:21], v19, v22 +; GFX7-NEXT: v_mul_lo_u32 v22, v6, v9 +; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18] +; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[22:23] +; GFX7-NEXT: v_add_i32_e64 v3, s[22:23], v23, v3 +; GFX7-NEXT: s_or_b64 s[14:15], s[14:15], s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[14:15] +; GFX7-NEXT: v_add_i32_e64 v4, s[14:15], v24, v4 +; GFX7-NEXT: s_or_b64 s[14:15], 
s[16:17], s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[14:15] +; GFX7-NEXT: v_add_i32_e64 v5, s[14:15], v21, v5 +; GFX7-NEXT: s_or_b64 s[6:7], s[6:7], s[14:15] +; GFX7-NEXT: v_mul_lo_u32 v1, v1, v14 +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GFX7-NEXT: v_add_i32_e64 v6, s[6:7], v19, v6 +; GFX7-NEXT: s_or_b64 s[6:7], s[20:21], s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v20, v0, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v1, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v2, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v12, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v11, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v25, vcc +; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v22, s[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v16 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v16, v0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; 
GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mov_b32_e32 v18, v23 -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-NEXT: v_mov_b32_e32 v1, v23 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v2, v22 -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v22, s[6:7], 0, v22, s[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], 
s[8:9], v4, v8, v[16:17] +; GFX8-NEXT: v_mov_b32_e32 v16, v19 +; GFX8-NEXT: v_mov_b32_e32 v17, v20 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v25, 0, 1, s[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[10:11], v1, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0 +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20] +; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v25, s[10:11], 0, v25, s[10:11] +; GFX8-NEXT: v_add_u32_e64 v23, s[14:15], v23, v19 +; GFX8-NEXT: v_add_u32_e64 v24, s[16:17], v22, v20 +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0 +; GFX8-NEXT: v_addc_u32_e64 v25, s[12:13], 0, v25, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20] +; GFX8-NEXT: v_addc_u32_e64 v25, s[6:7], 0, v25, s[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20] +; GFX8-NEXT: v_mov_b32_e32 v22, v19 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22] +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21] +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GFX8-NEXT: v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v19, s[4:5], 0, v19, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v19, s[8:9], 0, v19, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22] ; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX8-NEXT: 
v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX8-NEXT: v_mul_lo_u32 v11, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22] +; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22] +; GFX8-NEXT: v_add_u32_e64 v21, s[6:7], v25, v21 +; GFX8-NEXT: v_add_u32_e64 v19, s[20:21], v19, v22 +; GFX8-NEXT: v_mul_lo_u32 v22, v6, v9 +; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[22:23] +; GFX8-NEXT: v_add_u32_e64 v3, s[22:23], v23, v3 +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[14:15] +; GFX8-NEXT: v_add_u32_e64 v4, s[14:15], v24, v4 +; GFX8-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[14:15] +; GFX8-NEXT: v_add_u32_e64 v5, s[14:15], v21, v5 +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[14:15] +; GFX8-NEXT: v_mul_lo_u32 v1, v1, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GFX8-NEXT: v_add_u32_e64 
v6, s[6:7], v19, v6 +; GFX8-NEXT: s_or_b64 s[6:7], s[20:21], s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v20, v0, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v1, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v2, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v12, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v11, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v25, vcc +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v22, s[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v16 +; GFX8-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-NEXT: v_mov_b32_e32 v2, v10 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mov_b32_e32 v18, v23 -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX9-NEXT: 
v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v20 -; GFX9-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v22 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v22, s[6:7], 0, v22, s[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v4, v8, v[16:17] +; GFX9-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, s[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[10:11], 
v1, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v25, s[10:11], 0, v25, s[10:11] +; GFX9-NEXT: v_add_co_u32_e64 v23, s[14:15], v23, v19 +; GFX9-NEXT: v_add_co_u32_e64 v24, s[16:17], v22, v20 +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v25, s[12:13], 0, v25, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e64 v25, s[6:7], 0, v25, s[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20] +; GFX9-NEXT: v_mov_b32_e32 v22, v19 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22] +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21] +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v19, s[4:5], 0, v19, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v19, s[8:9], 0, v19, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22] ; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX9-NEXT: 
v_mul_lo_u32 v10, v16, v15 -; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX9-NEXT: v_mul_lo_u32 v11, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22] +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22] +; GFX9-NEXT: v_add_co_u32_e64 v21, s[6:7], v25, v21 +; GFX9-NEXT: v_add_co_u32_e64 v19, s[20:21], v19, v22 +; GFX9-NEXT: v_mul_lo_u32 v22, v6, v9 +; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[22:23] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[22:23], v23, v3 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[14:15] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[14:15], v24, v4 +; GFX9-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[14:15] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[14:15], v21, v5 +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[14:15] +; GFX9-NEXT: v_mul_lo_u32 v1, v1, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[6:7], v19, v6 +; GFX9-NEXT: s_or_b64 s[6:7], s[20:21], s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v20, v0, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v1, s[12:13] 
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v2, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v12, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v11, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v25, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v22, s[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-NEXT: v_mov_b32_e32 v2, v10 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2246,69 +2548,82 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 +; GFX10-NEXT: v_mov_b32_e32 v18, v2 ; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX10-NEXT: v_mul_lo_u32 v28, v4, v11 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s5, v16, v10, 0 +; GFX10-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX10-NEXT: v_mul_lo_u32 v14, v17, v14 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v18, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s4, v17, v11, v[19:20] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20] +; 
GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s4, v17, v9, v[21:22] +; GFX10-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20] ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 -; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s4, v18, v8, v[21:22] +; GFX10-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20] +; GFX10-NEXT: v_add_co_ci_u32_e64 v25, s4, 0, v25, s4 +; GFX10-NEXT: v_mad_u64_u32 v[23:24], s5, v6, v8, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v22 +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v1, v19 +; GFX10-NEXT: v_mov_b32_e32 v19, v20 +; GFX10-NEXT: v_mov_b32_e32 v20, v23 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v16, v11, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 
v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_mul_lo_u32 v13, v18, v13 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v17, v10, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v2, s4 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s5, v17, v12, v[19:20] +; GFX10-NEXT: v_mov_b32_e32 v2, v21 +; GFX10-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v18, v9, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4 +; 
GFX10-NEXT: v_mad_u64_u32 v[19:20], s6, v18, v11, v[19:20] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s7, v16, v9, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s4, v3, v8, v[22:23] +; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s7 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s7, v3, v10, v[19:20] +; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s4, 0, v6, s4 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v17, v8, v[1:2] +; GFX10-NEXT: v_add_co_u32 v3, s8, v23, v21 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v4, v9, v[10:11] +; GFX10-NEXT: v_add_co_u32 v4, s10, v25, v22 +; GFX10-NEXT: v_add_co_u32 v3, s9, v3, v6 +; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[9:10] +; GFX10-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX10-NEXT: v_add_co_u32 v4, s9, v4, v11 +; GFX10-NEXT: s_or_b32 s9, s10, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s9 +; GFX10-NEXT: v_add_co_u32 v5, s9, v19, v5 +; GFX10-NEXT: v_add_co_u32 v6, s11, v26, v6 +; GFX10-NEXT: v_add_co_u32 v5, s10, v5, v9 +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s9 +; GFX10-NEXT: v_add_co_u32 v6, s9, v6, v9 +; GFX10-NEXT: s_or_b32 s9, s11, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v24, v10, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v14, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v13, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v12, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v28, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v29, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2316,69 +2631,81 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX11-NEXT: v_mov_b32_e32 v18, v2 ; GFX11-NEXT: v_mul_lo_u32 v7, v7, v8 ; 
GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[18:19], null, v16, v12, 0 -; GFX11-NEXT: v_mul_lo_u32 v30, v17, v14 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], null, v16, v10, 0 +; GFX11-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX11-NEXT: v_mul_lo_u32 v29, v5, v10 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], s0, v17, v11, v[19:20] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s0, v17, v9, v[21:22] +; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v18, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20] +; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX11-NEXT: v_mul_lo_u32 v14, v17, v14 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20] +; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20] +; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 
v[22:23], null, v6, v8, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v20, v22 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX11-NEXT: v_mov_b32_e32 v20, v18 -; GFX11-NEXT: v_mov_b32_e32 v19, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20] +; GFX11-NEXT: v_mad_u64_u32 v[23:24], null, v6, v8, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, v19 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s0, v18, v8, v[21:22] +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, s0, 0, v25, s0 +; GFX11-NEXT: v_dual_mov_b32 v19, v20 :: v_dual_mov_b32 v20, v23 +; GFX11-NEXT: v_mov_b32_e32 v0, v22 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20] +; GFX11-NEXT: v_mul_lo_u32 v13, v18, v13 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s0, v16, v11, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v8, 0 -; GFX11-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX11-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX11-NEXT: v_mov_b32_e32 v14, v21 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX11-NEXT: v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19] -; GFX11-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX11-NEXT: v_mov_b32_e32 v13, v1 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12] -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v8, 
v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 -; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], s1, v17, v12, v[19:20] +; GFX11-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s0, v17, v10, v[22:23] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, v21 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], s2, v18, v11, v[19:20] +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s0, v18, v9, v[22:23] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s3, v16, v9, v[1:2] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s0, v3, v8, v[22:23] +; GFX11-NEXT: v_cndmask_b32_e64 v23, 0, 1, s3 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s3, v3, v10, v[19:20] +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v6, s0 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v17, v8, v[1:2] +; GFX11-NEXT: v_add_co_u32 v3, s4, v23, v21 +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], s0, v4, v9, v[10:11] +; GFX11-NEXT: v_add_co_u32 v4, s6, v25, v22 +; GFX11-NEXT: v_add_co_u32 v3, s5, v3, v6 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[9:10] +; GFX11-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX11-NEXT: 
v_add_co_u32 v4, s5, v4, v11 +; GFX11-NEXT: s_or_b32 s5, s6, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX11-NEXT: v_add_co_u32 v5, s5, v19, v5 +; GFX11-NEXT: v_add_co_u32 v6, s7, v26, v6 +; GFX11-NEXT: v_add_co_u32 v5, s6, v5, v9 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX11-NEXT: v_add_co_u32 v6, s5, v6, v9 +; GFX11-NEXT: s_or_b32 s5, s7, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v24, v10, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v14, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v13, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v12, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v28, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v29, s1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo ; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2390,90 +2717,104 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX12-NEXT: v_mov_b32_e32 v18, v2 ; GFX12-NEXT: v_mul_lo_u32 v7, v7, v8 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], null, v16, v12, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], null, v16, v10, 0 +; GFX12-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v29, v5, v10 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX12-NEXT: 
v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], s0, v17, v11, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s0, v17, v9, v[21:22] +; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v18, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX12-NEXT: v_mul_lo_u32 v14, v17, v14 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20] +; 
GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v20, v22 -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], null, v6, v8, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v1, v19 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s0, v18, v8, v[21:22] +; GFX12-NEXT: v_add_co_ci_u32_e64 v25, s0, 0, v25, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v19, v20 :: v_dual_mov_b32 v20, v23 +; GFX12-NEXT: v_mov_b32_e32 v0, v22 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20] +; GFX12-NEXT: v_mul_lo_u32 v13, v18, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v16, v11, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; 
GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX12-NEXT: v_mov_b32_e32 v13, v1 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v14, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], s1, v17, v12, v[19:20] +; GFX12-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v17, v10, v[22:23] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, v21 +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], s2, v18, v11, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: 
v_mad_co_u64_u32 v[22:23], s0, v18, v9, v[22:23] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s3, v16, v9, v[1:2] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s0, v3, v8, v[22:23] +; GFX12-NEXT: v_cndmask_b32_e64 v23, 0, 1, s3 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v6, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v17, v8, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_u32 v3, s4, v23, v21 +; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], s0, v4, v9, v[10:11] +; GFX12-NEXT: v_add_co_u32 v4, s6, v25, v22 +; GFX12-NEXT: v_add_co_u32 v3, s5, v3, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s4, s4, s5 +; GFX12-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[9:10] +; GFX12-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX12-NEXT: v_add_co_u32 v4, s5, v4, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s5, s6, s5 +; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v5, s5, v19, v5 +; GFX12-NEXT: v_add_co_u32 v6, s7, v26, v6 +; GFX12-NEXT: v_add_co_u32 v5, s6, v5, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s5, s5, s6 +; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 -; GFX12-NEXT: 
v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: v_add_co_u32 v6, s5, v6, v9 +; GFX12-NEXT: s_or_b32 s5, s7, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v24, v10, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v14, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v13, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v12, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v28, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v29, s1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo ; GFX12-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 1821d29d4b050..ae6bcb6b08202 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2618,10 +2618,13 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: 
v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2630,24 +2633,45 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_uaddsat_i48: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] -; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX10PLUS-NEXT: 
s_setpc_b64 s[30:31] +; GFX10-LABEL: v_uaddsat_i48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v2 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v2 +; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, s4 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_i48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, s1, v0, v2 +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: v_add_co_u32 v1, s1, v1, v2 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) ret i48 %result } @@ -2677,7 +2701,14 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s2 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: ; return to shader part epilog @@ -2687,7 +2718,14 @@ define amdgpu_ps i48 
@s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s2 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: ; return to shader part epilog @@ -2697,7 +2735,14 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s2 +; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s2, s3, s2 +; GFX10PLUS-NEXT: s_and_b32 s2, s2, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -2728,11 +2773,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 +; GFX8-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: ; return to shader part epilog ; @@ -2740,11 +2787,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; @@ -2752,10 +2801,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, s1, v1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 
@llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) @@ -2787,11 +2839,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: ; return to shader part epilog ; @@ -2799,11 +2853,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; @@ -2811,10 +2867,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX10PLUS-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, v0, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, v1, s1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) @@ -2827,38 +2886,62 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-LABEL: v_uaddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v0, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v1, v2 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, 
s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_uaddsat_i64: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_uaddsat_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v2 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v2 +; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s1, v0, v2 +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: v_add_co_u32 v1, s1, v1, v2 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 
v1, v1, -1, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } @@ -2867,28 +2950,56 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_uaddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s2 -; GFX6-NEXT: s_addc_u32 s1, s1, s3 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s3 +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s2 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_or_b32 s2, s3, s2 +; GFX6-NEXT: s_and_b32 s2, s2, 1 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s2 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s2 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s2 +; 
GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s2, s3, s2 +; GFX10PLUS-NEXT: s_and_b32 s2, s2, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) @@ -2898,37 +3009,46 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX6-LABEL: uaddsat_i64_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v1, s[0:1], v1, v2 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: uaddsat_i64_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i64_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 
v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: uaddsat_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, s1, v1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -2938,37 +3058,46 @@ define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX6-LABEL: uaddsat_i64_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v1, s[0:1], v1, v2 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; 
GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: uaddsat_i64_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i64_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: uaddsat_i64_vs: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, v0, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, v1, s1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 ; GFX10PLUS-NEXT: ; 
return to shader part epilog %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -2979,51 +3108,75 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-LABEL: v_uaddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v0, v4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v1, v4 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; GFX8-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v6 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[4:5], v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v6 
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v4 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v6 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v4 +; GFX10-NEXT: s_or_b32 s4, s4, s6 +; GFX10-NEXT: v_add_co_u32 v3, s7, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, s4 +; GFX10-NEXT: s_or_b32 s4, s5, s7 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3031,12 +3184,18 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-LABEL: v_uaddsat_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: v_add_co_u32 v2, s0, v2, v6 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v3, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, s1, v0, v4 +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 +; GFX11-NEXT: v_add_co_u32 v2, s2, v2, v6 +; GFX11-NEXT: v_add_co_u32 v3, s1, v3, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX11-NEXT: v_add_co_u32 v1, s2, v1, v4 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: v_add_co_u32 v3, s3, v3, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 +; GFX11-NEXT: s_or_b32 s0, s1, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, s0 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] @@ -3048,40 +3207,96 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-LABEL: s_uaddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: s_addc_u32 s1, s1, s5 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s5 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s4 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s4, s4, 1 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: s_add_u32 s2, s2, s6 -; GFX6-NEXT: s_addc_u32 s3, s3, s7 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_add_u32 s3, s3, s7 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_add_u32 s3, s3, s4 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s4, s4, 1 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s5 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s4 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: s_add_u32 s2, s2, s6 -; GFX8-NEXT: s_addc_u32 s3, s3, s7 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_add_u32 s3, s3, s7 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_add_u32 s3, s3, s4 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s4 -; 
GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s5 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: s_add_u32 s2, s2, s6 -; GFX9-NEXT: s_addc_u32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_add_u32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_add_u32 s3, s3, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_v2i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s4, s5, s4 +; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_add_u32 s2, s2, s6 -; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s3, s3, s7 +; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s3, s3, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s4, s5, s4 +; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) @@ -3092,8 +3307,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; 
GFX6-LABEL: s_uaddsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: s_addc_u32 s1, s1, s5 -; GFX6-NEXT: s_addc_u32 s2, s2, s6 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s5 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s4 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_add_u32 s2, s2, s6 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_and_b32 s4, s4, 1 +; GFX6-NEXT: s_add_u32 s2, s2, s4 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s4, s4, 1 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: s_addc_u32 s3, s3, s7 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] @@ -3102,8 +3329,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-LABEL: s_uaddsat_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: s_addc_u32 s2, s2, s6 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s5 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s4 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_add_u32 s2, s2, s6 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_add_u32 s2, s2, s4 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: s_addc_u32 s3, s3, s7 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] @@ -3112,8 +3351,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-LABEL: s_uaddsat_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_addc_u32 s2, s2, s6 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s5 +; GFX9-NEXT: s_cselect_b32 s5, 1, 
0 +; GFX9-NEXT: s_add_u32 s1, s1, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_add_u32 s2, s2, s6 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: s_addc_u32 s3, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] @@ -3122,8 +3373,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10PLUS-LABEL: s_uaddsat_i128: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 -; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s6 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s4, s5, s4 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s6 +; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s4, s5, s4 +; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] @@ -3135,13 +3398,17 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX6-LABEL: uaddsat_i128_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, 
vcc, v5, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v1, s[0:1], v1, v4 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], s2, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3150,13 +3417,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; ; GFX8-LABEL: uaddsat_i128_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v4 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], s2, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3165,13 +3436,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; ; GFX9-LABEL: uaddsat_i128_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v4 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[4:5], s2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3180,9 +3455,15 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; ; GFX10PLUS-LABEL: uaddsat_i128_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, s1, v1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v2, s2, s2, v2 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v4 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v2, s0, v2, v4 +; GFX10PLUS-NEXT: s_or_b32 vcc_lo, s2, s0 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo @@ -3197,13 +3478,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX6-LABEL: 
uaddsat_i128_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v1, s[0:1], v1, v4 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], s2, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3212,13 +3497,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX8-LABEL: uaddsat_i128_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v4 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], s2, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 
-1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3227,13 +3516,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX9-LABEL: uaddsat_i128_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v4 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[4:5], s2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3242,9 +3535,15 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX10PLUS-LABEL: uaddsat_i128_vs: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, v0, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, v1, s1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v2, s2, v2, s2 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v4 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v2, s0, v2, v4 +; GFX10PLUS-NEXT: s_or_b32 vcc_lo, s2, s0 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo ; 
GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo @@ -3260,17 +3559,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-LABEL: v_uaddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v0, v8 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v2, s[6:7], v2, v10 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GFX6-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; GFX6-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v6, s[6:7], v6, v14 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc @@ -3281,17 +3592,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-LABEL: v_uaddsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v8 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v8 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v2, s[6:7], v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v12 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v8 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v6, s[6:7], v6, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc @@ -3302,17 +3625,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-LABEL: v_uaddsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v10, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v8 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, 
s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v8 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v11, vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v13, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v8 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[6:7], v6, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 +; GFX9-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc @@ -3323,18 +3658,30 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-LABEL: v_uaddsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8 -; GFX10-NEXT: v_add_co_u32 v4, s4, v4, v12 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, v5, v13, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, v6, v14, s4 +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v8 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v10 +; GFX10-NEXT: v_add_co_u32 v4, s7, 
v4, v12 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v8 +; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: v_add_co_u32 v5, s5, v5, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7 +; GFX10-NEXT: s_or_b32 vcc_lo, s6, s4 +; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v14 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v15, s4 +; GFX10-NEXT: v_add_co_u32 v5, s4, v5, v8 +; GFX10-NEXT: s_or_b32 s4, s5, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v8 +; GFX10-NEXT: s_or_b32 s4, s6, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v15, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, -1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, -1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, -1, s4 @@ -3344,18 +3691,30 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-LABEL: v_uaddsat_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8 -; GFX11-NEXT: v_add_co_u32 v4, s0, v4, v12 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v5, v13, s0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v6, v14, s0 +; GFX11-NEXT: v_add_co_u32 v0, s1, v0, v8 +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, v9 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 +; GFX11-NEXT: v_add_co_u32 v2, s2, v2, v10 +; GFX11-NEXT: v_add_co_u32 v4, s3, v4, v12 +; GFX11-NEXT: v_add_co_u32 v1, s1, v1, v8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_add_co_u32 v5, s1, v5, v13 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX11-NEXT: 
v_add_co_u32 v2, s0, v2, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 +; GFX11-NEXT: s_or_b32 vcc_lo, s2, s0 +; GFX11-NEXT: v_add_co_u32 v6, s2, v6, v14 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v7, v15, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v5, v8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, s0, v6, v8 +; GFX11-NEXT: s_or_b32 s0, s2, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v7, v15, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, -1, s0 @@ -3369,14 +3728,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_uaddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s9 -; GFX6-NEXT: s_addc_u32 s2, s2, s10 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s9 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s8 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_add_u32 s2, s2, s10 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_add_u32 s2, s2, s8 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: s_addc_u32 s3, s3, s11 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX6-NEXT: s_add_u32 s4, s4, s12 -; GFX6-NEXT: s_addc_u32 s5, s5, s13 -; GFX6-NEXT: s_addc_u32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_add_u32 s5, s5, s13 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: 
s_add_u32 s5, s5, s8 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_add_u32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_add_u32 s6, s6, s8 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: s_addc_u32 s7, s7, s15 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] @@ -3385,14 +3768,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-LABEL: s_uaddsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s8 -; GFX8-NEXT: s_addc_u32 s1, s1, s9 -; GFX8-NEXT: s_addc_u32 s2, s2, s10 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s9 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s8 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NEXT: s_add_u32 s2, s2, s10 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_add_u32 s2, s2, s8 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: s_addc_u32 s3, s3, s11 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX8-NEXT: s_add_u32 s4, s4, s12 -; GFX8-NEXT: s_addc_u32 s5, s5, s13 -; GFX8-NEXT: s_addc_u32 s6, s6, s14 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_add_u32 s5, s5, s13 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_add_u32 s5, s5, s8 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NEXT: s_add_u32 s6, s6, s14 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_add_u32 s6, s6, s8 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; 
GFX8-NEXT: s_addc_u32 s7, s7, s15 ; GFX8-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] ; GFX8-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] @@ -3401,14 +3808,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-LABEL: s_uaddsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s9 -; GFX9-NEXT: s_addc_u32 s2, s2, s10 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s9 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s8 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s2, s2, s10 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: s_addc_u32 s3, s3, s11 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX9-NEXT: s_add_u32 s4, s4, s12 -; GFX9-NEXT: s_addc_u32 s5, s5, s13 -; GFX9-NEXT: s_addc_u32 s6, s6, s14 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_add_u32 s5, s5, s13 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s6, s6, s14 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_add_u32 s6, s6, s8 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: s_addc_u32 s7, s7, s15 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] @@ -3417,14 +3848,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10PLUS-LABEL: s_uaddsat_v2i128: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s9 -; 
GFX10PLUS-NEXT: s_addc_u32 s2, s2, s10 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s9 +; GFX10PLUS-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s8, s9, s8 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s10 +; GFX10PLUS-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s8, s9, s8 +; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s11 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX10PLUS-NEXT: s_add_u32 s4, s4, s12 -; GFX10PLUS-NEXT: s_addc_u32 s5, s5, s13 -; GFX10PLUS-NEXT: s_addc_u32 s6, s6, s14 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s5, s5, s13 +; GFX10PLUS-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s5, s5, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s8, s9, s8 +; GFX10PLUS-NEXT: s_add_u32 s6, s6, s14 +; GFX10PLUS-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1 +; GFX10PLUS-NEXT: s_add_u32 s6, s6, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s8, s9, s8 +; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10PLUS-NEXT: s_addc_u32 s7, s7, s15 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] ; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] From cd4e2466163c46c4e731f8dfc77a9b6673d26c89 Mon Sep 17 00:00:00 2001 From: Lucile Rose Nihlen Date: Tue, 20 Feb 2024 15:30:38 -0500 Subject: [PATCH 002/351] repair and re-enable Windows buildkite presubmit (#82393) --- .ci/generate-buildkite-pipeline-premerge | 5 +---- .ci/monolithic-windows.sh | 11 ++++++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.ci/generate-buildkite-pipeline-premerge 
b/.ci/generate-buildkite-pipeline-premerge index c14ec464a43a6..4ebf304e23d58 100755 --- a/.ci/generate-buildkite-pipeline-premerge +++ b/.ci/generate-buildkite-pipeline-premerge @@ -233,10 +233,7 @@ linux_projects=$(add-dependencies ${linux_projects_to_test} | sort | uniq) windows_projects_to_test=$(exclude-windows $(compute-projects-to-test ${modified_projects})) windows_check_targets=$(check-targets ${windows_projects_to_test} | sort | uniq) -# Temporary disable the windows job. -# See https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840 -#windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq) -windows_projects="" +windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq) # Generate the appropriate pipeline if [[ "${linux_projects}" != "" ]]; then diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index a704e855f011c..9561bf668a90c 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -38,6 +38,12 @@ targets="${2}" echo "--- cmake" pip install -q -r ${MONOREPO_ROOT}/mlir/python/requirements.txt + +# The CMAKE_*_LINKER_FLAGS to disable the manifest come from research +# on fixing a build reliability issue on the build server, please +# see https://github.com/llvm/llvm-project/pull/82393 and +# https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40 +# for further information. cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ @@ -49,7 +55,10 @@ cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \ -D COMPILER_RT_BUILD_ORC=OFF \ -D CMAKE_C_COMPILER_LAUNCHER=sccache \ -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \ - -D MLIR_ENABLE_BINDINGS_PYTHON=ON + -D MLIR_ENABLE_BINDINGS_PYTHON=ON \ + -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \ + -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \ + -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" echo "--- ninja" # Targets are not escaped as they are passed as separate arguments. 
From a468d02fe9e544f39f6c0428c23b2396df6a35ff Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 20 Feb 2024 12:33:08 -0800 Subject: [PATCH 003/351] [flang][runtime] Add FortranFloat128Math wrapper library. (#81971) Implemented few entry points for REAL(16) math in FortranF128Math static library. It is a thin wrapper around GNU libquadmath. Flang driver can always link it, and the dependencies will be brought in as needed. The final Fortran program/library that uses any of the entry points will depend on the underlying third-party library - this dependency has to be resolved somehow. I added FLANG_RUNTIME_F128_MATH_LIB CMake control so that the compiler driver and the runtime library can be built using the same third-party library: this way the linker knows which dependency to link in (under --as-needed). The compiler distribution should specify which third-party library is required for linking/running the apps that use REAL(16). The compiler package may provide a version of the third-party library or at least a stub library that can be used for linking, but the final program execution will still require the actual library. 
--- clang/include/clang/Driver/Driver.h | 10 ++ clang/lib/Driver/ToolChains/CommonArgs.cpp | 8 ++ flang/CMakeLists.txt | 17 +++ .../flang/Optimizer/Builder/IntrinsicCall.h | 19 ++-- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 101 ++++++++++++------ flang/runtime/CMakeLists.txt | 20 ++++ flang/runtime/Float128Math/CMakeLists.txt | 56 ++++++++++ flang/runtime/Float128Math/cabs.cpp | 24 +++++ flang/runtime/Float128Math/math-entries.h | 77 +++++++++++++ flang/runtime/Float128Math/sin.cpp | 22 ++++ flang/runtime/Float128Math/sqrt.cpp | 22 ++++ .../Lower/Intrinsics/missing-math-runtime.f90 | 6 +- flang/tools/flang-driver/driver.cpp | 3 + 13 files changed, 345 insertions(+), 40 deletions(-) create mode 100644 flang/runtime/Float128Math/CMakeLists.txt create mode 100644 flang/runtime/Float128Math/cabs.cpp create mode 100644 flang/runtime/Float128Math/math-entries.h create mode 100644 flang/runtime/Float128Math/sin.cpp create mode 100644 flang/runtime/Float128Math/sqrt.cpp diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 908bc87c14b1c..a5ca637853a6a 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -251,6 +251,11 @@ class Driver { /// from non-system headers are emitted. HeaderIncludeFilteringKind CCPrintHeadersFiltering = HIFIL_None; + /// Name of the library that provides implementations of + /// IEEE-754 128-bit float math functions used by Fortran F128 + /// runtime library. It should be linked as needed by the linker job. + std::string FlangF128MathLibrary; + /// Set CC_LOG_DIAGNOSTICS mode, which causes the frontend to log diagnostics /// to CCLogDiagnosticsFilename or to stderr, in a stable machine readable /// format. 
@@ -440,6 +445,11 @@ class Driver { bool offloadHostOnly() const { return Offload == OffloadHost; } bool offloadDeviceOnly() const { return Offload == OffloadDevice; } + void setFlangF128MathLibrary(std::string name) { + FlangF128MathLibrary = std::move(name); + } + StringRef getFlangF128MathLibrary() const { return FlangF128MathLibrary; } + /// Compute the desired OpenMP runtime from the flags provided. OpenMPRuntimeKind getOpenMPRuntime(const llvm::opt::ArgList &Args) const; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0fd7b8424eb4b..e5196bd8b5ae9 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1285,6 +1285,14 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args, // add the correct libraries to link against as dependents in the object // file. if (!TC.getTriple().isKnownWindowsMSVCEnvironment()) { + StringRef f128LibName = TC.getDriver().getFlangF128MathLibrary(); + f128LibName.consume_front_insensitive("lib"); + if (!f128LibName.empty()) { + CmdArgs.push_back("-lFortranFloat128Math"); + addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/true); + CmdArgs.push_back(Args.MakeArgString("-l" + f128LibName)); + addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/false); + } CmdArgs.push_back("-lFortranRuntime"); CmdArgs.push_back("-lFortranDecimal"); } diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index f8ad39ba712f8..21617aeea0215 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -33,6 +33,17 @@ endif() option(FLANG_ENABLE_WERROR "Fail and stop building flang if a warning is triggered." OFF) +# The out of tree builds of the compiler and the Fortran runtime +# must use the same setting of FLANG_RUNTIME_F128_MATH_LIB +# to be composable. Failure to synchronize this setting may result +# in linking errors or fatal failures in F128 runtime functions. 
+set(FLANG_RUNTIME_F128_MATH_LIB "" CACHE STRING + "Specifies the target library used for implementing IEEE-754 128-bit float \ + math in F18 runtime, e.g. it might be libquadmath for targets where \ + REAL(16) is mapped to __float128, or libm for targets where REAL(16) \ + is mapped to long double, etc." + ) + # Check for a standalone build and configure as appropriate from # there. if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) @@ -321,6 +332,12 @@ if (FLANG_REPOSITORY_STRING) add_definitions(-DFLANG_REPOSITORY_STRING="${FLANG_REPOSITORY_STRING}") endif() +if (FLANG_RUNTIME_F128_MATH_LIB) + add_compile_definitions( + -DFLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}" + ) +endif() + include(TestBigEndian) test_big_endian(IS_BIGENDIAN) if (IS_BIGENDIAN) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 3f1e22ecca4cc..7cb99d61a686e 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -494,12 +494,13 @@ struct RuntimeFunction { fir::runtime::FuncTypeBuilderFunc typeGenerator; }; -/// Callback type for generating lowering for a math operation. -using MathGeneratorTy = mlir::Value (*)(fir::FirOpBuilder &, mlir::Location, - llvm::StringRef, mlir::FunctionType, - llvm::ArrayRef); - struct MathOperation { + // Callback type for generating lowering for a math operation. + using MathGeneratorTy = mlir::Value (*)(fir::FirOpBuilder &, mlir::Location, + const MathOperation &, + mlir::FunctionType, + llvm::ArrayRef); + // Overrides fir::runtime::FuncTypeBuilderFunc to add FirOpBuilder argument. 
using FuncTypeBuilderFunc = mlir::FunctionType (*)(mlir::MLIRContext *, fir::FirOpBuilder &); @@ -681,25 +682,25 @@ getTypesForArgs(llvm::ArrayRef args) { } mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef libFuncName, + const MathOperation &mathOp, mlir::FunctionType libFuncType, llvm::ArrayRef args); template mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef mathLibFuncName, + const MathOperation &mathOp, mlir::FunctionType mathLibFuncType, llvm::ArrayRef args); template mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef mathLibFuncName, + const MathOperation &mathOp, mlir::FunctionType mathLibFuncType, llvm::ArrayRef args); mlir::Value genLibSplitComplexArgsCall(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef libFuncName, + const MathOperation &mathOp, mlir::FunctionType libFuncType, llvm::ArrayRef args); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index a3536895ca3b7..3a82be895d37c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -657,10 +657,61 @@ static llvm::cl::opt "instead of libm complex operations"), llvm::cl::init(false)); +/// Return a string containing the given Fortran intrinsic name +/// with the type of its arguments specified in funcType +/// surrounded by the given prefix/suffix. 
+static std::string +prettyPrintIntrinsicName(fir::FirOpBuilder &builder, mlir::Location loc, + llvm::StringRef prefix, llvm::StringRef name, + llvm::StringRef suffix, mlir::FunctionType funcType) { + std::string output = prefix.str(); + llvm::raw_string_ostream sstream(output); + if (name == "pow") { + assert(funcType.getNumInputs() == 2 && "power operator has two arguments"); + std::string displayName{" ** "}; + sstream << numericMlirTypeToFortran(builder, funcType.getInput(0), loc, + displayName) + << displayName + << numericMlirTypeToFortran(builder, funcType.getInput(1), loc, + displayName); + } else { + sstream << name.upper() << "("; + if (funcType.getNumInputs() > 0) + sstream << numericMlirTypeToFortran(builder, funcType.getInput(0), loc, + name); + for (mlir::Type argType : funcType.getInputs().drop_front()) { + sstream << ", " << numericMlirTypeToFortran(builder, argType, loc, name); + } + sstream << ")"; + } + sstream << suffix; + return output; +} + +// Generate a call to the Fortran runtime library providing +// support for 128-bit float math via a third-party library. +// If the compiler is built without FLANG_RUNTIME_F128_MATH_LIB, +// this function will report an error. 
+static mlir::Value genLibF128Call(fir::FirOpBuilder &builder, + mlir::Location loc, + const MathOperation &mathOp, + mlir::FunctionType libFuncType, + llvm::ArrayRef args) { +#ifndef FLANG_RUNTIME_F128_MATH_LIB + std::string message = prettyPrintIntrinsicName( + builder, loc, "compiler is built without support for '", mathOp.key, "'", + libFuncType); + fir::emitFatalError(loc, message, /*genCrashDiag=*/false); +#else // FLANG_RUNTIME_F128_MATH_LIB + return genLibCall(builder, loc, mathOp, libFuncType, args); +#endif // FLANG_RUNTIME_F128_MATH_LIB +} + mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef libFuncName, + const MathOperation &mathOp, mlir::FunctionType libFuncType, llvm::ArrayRef args) { + llvm::StringRef libFuncName = mathOp.runtimeFunc; LLVM_DEBUG(llvm::dbgs() << "Generating '" << libFuncName << "' call with type "; libFuncType.dump(); llvm::dbgs() << "\n"); @@ -718,7 +769,7 @@ mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value genLibSplitComplexArgsCall(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef libFuncName, + const MathOperation &mathOp, mlir::FunctionType libFuncType, llvm::ArrayRef args) { assert(args.size() == 2 && "Incorrect #args to genLibSplitComplexArgsCall"); @@ -762,13 +813,12 @@ mlir::Value genLibSplitComplexArgsCall(fir::FirOpBuilder &builder, cplx2, /*isImagPart=*/true); splitArgs.push_back(imag2); - return genLibCall(builder, loc, libFuncName, getSplitComplexArgsType(), - splitArgs); + return genLibCall(builder, loc, mathOp, getSplitComplexArgsType(), splitArgs); } template mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef mathLibFuncName, + const MathOperation &mathOp, mlir::FunctionType mathLibFuncType, llvm::ArrayRef args) { // TODO: we have to annotate the math operations with flags @@ -791,13 +841,14 @@ mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc, // can be also lowered to libm 
calls for "fast" and "relaxed" // modes. mlir::Value result; + llvm::StringRef mathLibFuncName = mathOp.runtimeFunc; if (mathRuntimeVersion == preciseVersion && // Some operations do not have to be lowered as conservative // calls, since they do not affect strict FP behavior. // For example, purely integer operations like exponentiation // with integer operands fall into this class. !mathLibFuncName.empty()) { - result = genLibCall(builder, loc, mathLibFuncName, mathLibFuncType, args); + result = genLibCall(builder, loc, mathOp, mathLibFuncType, args); } else { LLVM_DEBUG(llvm::dbgs() << "Generating '" << mathLibFuncName << "' operation with type "; @@ -810,7 +861,7 @@ mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc, template mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef mathLibFuncName, + const MathOperation &mathOp, mlir::FunctionType mathLibFuncType, llvm::ArrayRef args) { mlir::Value result; @@ -819,11 +870,12 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, // If we have libm functions, we can attempt to generate the more precise // version of the complex math operation. + llvm::StringRef mathLibFuncName = mathOp.runtimeFunc; if (!mathLibFuncName.empty()) { // If we enabled MLIR complex or can use approximate operations, we should // NOT use libm. if (!forceMlirComplex && !canUseApprox) { - result = genLibCall(builder, loc, mathLibFuncName, mathLibFuncType, args); + result = genLibCall(builder, loc, mathOp, mathLibFuncType, args); LLVM_DEBUG(result.dump(); llvm::dbgs() << "\n"); return result; } @@ -863,6 +915,10 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, /// TODO: support remaining Fortran math intrinsics. /// See https://gcc.gnu.org/onlinedocs/gcc-12.1.0/gfortran/\ /// Intrinsic-Procedures.html for a reference. 
+constexpr auto FuncTypeReal16Real16 = genFuncType, Ty::Real<16>>; +constexpr auto FuncTypeReal16Complex16 = + genFuncType, Ty::Complex<16>>; + static constexpr MathOperation mathOperations[] = { {"abs", "fabsf", genFuncType, Ty::Real<4>>, genMathOp}, @@ -874,6 +930,7 @@ static constexpr MathOperation mathOperations[] = { genComplexMathOp}, {"abs", "cabs", genFuncType, Ty::Complex<8>>, genComplexMathOp}, + {"abs", RTNAME_STRING(CAbsF128), FuncTypeReal16Complex16, genLibF128Call}, {"acos", "acosf", genFuncType, Ty::Real<4>>, genLibCall}, {"acos", "acos", genFuncType, Ty::Real<8>>, genLibCall}, {"acos", "cacosf", genFuncType, Ty::Complex<4>>, genLibCall}, @@ -1110,6 +1167,7 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"sin", "sin", genFuncType, Ty::Real<8>>, genMathOp}, + {"sin", RTNAME_STRING(SinF128), FuncTypeReal16Real16, genLibF128Call}, {"sin", "csinf", genFuncType, Ty::Complex<4>>, genComplexMathOp}, {"sin", "csin", genFuncType, Ty::Complex<8>>, @@ -1122,6 +1180,7 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"sqrt", "sqrt", genFuncType, Ty::Real<8>>, genMathOp}, + {"sqrt", RTNAME_STRING(SqrtF128), FuncTypeReal16Real16, genLibF128Call}, {"sqrt", "csqrtf", genFuncType, Ty::Complex<4>>, genComplexMathOp}, {"sqrt", "csqrt", genFuncType, Ty::Complex<8>>, @@ -1345,27 +1404,9 @@ static void checkPrecisionLoss(llvm::StringRef name, // lowering and could be used here. Emit an error and continue // generating the code with the narrowing cast so that the user // can get a complete list of the problematic intrinsic calls. 
- std::string message("not yet implemented: no math runtime available for '"); - llvm::raw_string_ostream sstream(message); - if (name == "pow") { - assert(funcType.getNumInputs() == 2 && "power operator has two arguments"); - std::string displayName{" ** "}; - sstream << numericMlirTypeToFortran(builder, funcType.getInput(0), loc, - displayName) - << displayName - << numericMlirTypeToFortran(builder, funcType.getInput(1), loc, - displayName); - } else { - sstream << name.upper() << "("; - if (funcType.getNumInputs() > 0) - sstream << numericMlirTypeToFortran(builder, funcType.getInput(0), loc, - name); - for (mlir::Type argType : funcType.getInputs().drop_front()) { - sstream << ", " << numericMlirTypeToFortran(builder, argType, loc, name); - } - sstream << ")"; - } - sstream << "'"; + std::string message = prettyPrintIntrinsicName( + builder, loc, "not yet implemented: no math runtime available for '", + name, "'", funcType); mlir::emitError(loc, message); } @@ -1887,7 +1928,7 @@ IntrinsicLibrary::getRuntimeCallGenerator(llvm::StringRef name, for (auto [fst, snd] : llvm::zip(actualFuncType.getInputs(), args)) convertedArguments.push_back(builder.createConvert(loc, fst, snd)); mlir::Value result = mathOp->funcGenerator( - builder, loc, mathOp->runtimeFunc, actualFuncType, convertedArguments); + builder, loc, *mathOp, actualFuncType, convertedArguments); mlir::Type soughtType = soughtFuncType.getResult(0); return builder.createConvert(loc, soughtType, result); }; diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt index dfa9da502db0a..ac89184a7cbff 100644 --- a/flang/runtime/CMakeLists.txt +++ b/flang/runtime/CMakeLists.txt @@ -46,6 +46,23 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) endif () include_directories(BEFORE ${FLANG_SOURCE_DIR}/include) + + # The out of tree builds of the compiler and the Fortran runtime + # must use the same setting of FLANG_RUNTIME_F128_MATH_LIB + # to be composable. 
Failure to synchronize this setting may result + # in linking errors or fatal failures in F128 runtime functions. + set(FLANG_RUNTIME_F128_MATH_LIB "" CACHE STRING + "Specifies the target library used for implementing IEEE-754 128-bit float \ + math in F18 runtime, e.g. it might be libquadmath for targets where \ + REAL(16) is mapped to __float128, or libm for targets where REAL(16) \ + is mapped to long double, etc." + ) + + if (NOT FLANG_RUNTIME_F128_MATH_LIB STREQUAL "") + add_compile_definitions( + -DFLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}" + ) + endif() endif() include(CheckCXXSymbolExists) @@ -83,6 +100,9 @@ add_definitions(-U_GLIBCXX_ASSERTIONS) add_definitions(-U_LIBCPP_ENABLE_ASSERTIONS) add_subdirectory(FortranMain) +if (NOT ${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "") + add_subdirectory(Float128Math) +endif() set(sources ISO_Fortran_binding.cpp diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt new file mode 100644 index 0000000000000..f8da4d7ca1a9f --- /dev/null +++ b/flang/runtime/Float128Math/CMakeLists.txt @@ -0,0 +1,56 @@ +#===-- runtime/Float128Math/CMakeLists.txt ---------------------------------===# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===------------------------------------------------------------------------===# + +# FortranFloat128 implements IEEE-754 128-bit float math functions. +# It is a thin wapper and it currently relies on third-party +# libraries available for the target. +# It is distributed as a static library only. +# Fortran programs/libraries that end up linking any of the provided +# will have a dependency on the third-party library that is being +# used for building this FortranFloat128Math library. 
+ +if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath" OR + ${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "quadmath") + check_include_file(quadmath.h FOUND_QUADMATH_HEADER) + if(FOUND_QUADMATH_HEADER) + add_compile_definitions(HAS_QUADMATHLIB) + else() + message(FATAL_ERROR + "FLANG_RUNTIME_F128_MATH_LIB setting requires quadmath.h " + "to be available: ${FLANG_RUNTIME_F128_MATH_LIB}" + ) + endif() +else() + message(FATAL_ERROR + "Unsupported third-party library for Fortran F128 math runtime: " + "${FLANG_RUNTIME_F128_MATH_LIB}" + ) +endif() + +set(sources + cabs.cpp + sin.cpp + sqrt.cpp + ) + +include_directories(AFTER "${CMAKE_CURRENT_SOURCE_DIR}/..") +add_flang_library(FortranFloat128Math STATIC INSTALL_WITH_TOOLCHAIN ${sources}) + +if (DEFINED MSVC) + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded) + add_flang_library(FortranFloat128Math.static STATIC INSTALL_WITH_TOOLCHAIN + ${sources} + ) + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebug) + add_flang_library(FortranFloat128Math.static_dbg STATIC INSTALL_WITH_TOOLCHAIN + ${sources} + ) + add_dependencies(FortranFloat128Math FortranFloat128Math.static + FortranFloat128Math.static_dbg + ) +endif() diff --git a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp new file mode 100644 index 0000000000000..63f2bdf8e177a --- /dev/null +++ b/flang/runtime/Float128Math/cabs.cpp @@ -0,0 +1,24 @@ +//===-- runtime/Float128Math/cabs.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +// FIXME: the argument should be CppTypeFor, +// and it should be translated into the underlying library's +// corresponding complex128 type. +CppTypeFor RTDEF(CAbsF128)(ComplexF128 x) { + return CAbs::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h new file mode 100644 index 0000000000000..91c14b008b576 --- /dev/null +++ b/flang/runtime/Float128Math/math-entries.h @@ -0,0 +1,77 @@ +//===-- runtime/Float128Math/math-entries.h ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_ +#define FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_ +#include "terminator.h" +#include "tools.h" +#include "flang/Common/float128.h" +#include "flang/Runtime/entry-names.h" +#include + +namespace Fortran::runtime { + +// Define a class template to gracefully fail, when +// there is no specialized template that implements +// the required function via using the third-party +// implementation. +#define DEFINE_FALLBACK(caller) \ + template struct caller { \ + template \ + [[noreturn]] static std::invoke_result_t invoke( \ + ATs... args) { \ + Terminator terminator{__FILE__, __LINE__}; \ + terminator.Crash("Float128 variant of '%s' is unsupported", #caller); \ + } \ + }; + +// Define template specialization that is calling the third-party +// implementation. 
The template is specialized by a function pointer +// that is the FortranFloat128Math entry point. The signatures +// of the caller and the callee must match. +// +// Defining the specialization for any target library requires +// adding the generic template via DEFINE_FALLBACK, so that +// a build with another target library that does not define +// the same alias can gracefully fail in runtime. +#define DEFINE_SIMPLE_ALIAS(caller, callee) \ + template struct caller

{ \ + static RT invoke(ATs... args) { \ + static_assert(std::is_invocable_r_v); \ + if constexpr (std::is_same_v) { \ + callee(args...); \ + } else { \ + return callee(args...); \ + } \ + } \ + }; + +// Define fallback callers. +DEFINE_FALLBACK(CAbs) +DEFINE_FALLBACK(Sin) +DEFINE_FALLBACK(Sqrt) + +// Define ComplexF128 type that is compatible with +// the type of results/arguments of libquadmath. +// TODO: this may need more work for other libraries/compilers. +#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__) +typedef _Complex float __attribute__((mode(TC))) ComplexF128; +#else +typedef _Complex float __attribute__((mode(KC))) ComplexF128; +#endif + +#if HAS_QUADMATHLIB +// Define wrapper callers for libquadmath. +#include "quadmath.h" +DEFINE_SIMPLE_ALIAS(CAbs, cabsq) +DEFINE_SIMPLE_ALIAS(Sin, sinq) +DEFINE_SIMPLE_ALIAS(Sqrt, sqrtq) +#endif +} // namespace Fortran::runtime + +#endif // FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_ diff --git a/flang/runtime/Float128Math/sin.cpp b/flang/runtime/Float128Math/sin.cpp new file mode 100644 index 0000000000000..013eb9d119a6a --- /dev/null +++ b/flang/runtime/Float128Math/sin.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/sin.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(SinF128)( + CppTypeFor x) { + return Sin::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/sqrt.cpp b/flang/runtime/Float128Math/sqrt.cpp new file mode 100644 index 0000000000000..aafbd850ca973 --- /dev/null +++ b/flang/runtime/Float128Math/sqrt.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/sqrt.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(SqrtF128)( + CppTypeFor x) { + return Sqrt::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/test/Lower/Intrinsics/missing-math-runtime.f90 b/flang/test/Lower/Intrinsics/missing-math-runtime.f90 index 98d3abb17f3a8..ff767ba18faae 100644 --- a/flang/test/Lower/Intrinsics/missing-math-runtime.f90 +++ b/flang/test/Lower/Intrinsics/missing-math-runtime.f90 @@ -1,10 +1,14 @@ ! There is no quad math runtime available in lowering ! for now. Test that the TODO are emitted correctly. +! FIXME: the lit config has to flip a feature flag so that +! the tests can use different checks depending on whether +! REAL(16) math support is enabled or not. +! XFAIL: * ! RUN: bbc -emit-fir %s -o /dev/null 2>&1 | FileCheck %s complex(16) :: a real(16) :: b -! 
CHECK: not yet implemented: no math runtime available for 'ABS(COMPLEX(KIND=16))' +! CHECK: compiler is built without support for 'ABS(COMPLEX(KIND=16))' b = abs(a) end diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp index c4e56a862c861..52136df10c0b0 100644 --- a/flang/tools/flang-driver/driver.cpp +++ b/flang/tools/flang-driver/driver.cpp @@ -130,6 +130,9 @@ int main(int argc, const char **argv) { llvm::sys::getDefaultTargetTriple(), diags, "flang LLVM compiler"); theDriver.setTargetAndMode(targetandMode); +#ifdef FLANG_RUNTIME_F128_MATH_LIB + theDriver.setFlangF128MathLibrary(FLANG_RUNTIME_F128_MATH_LIB); +#endif std::unique_ptr c( theDriver.BuildCompilation(args)); llvm::SmallVector, 4> From ed4bdb86b084bf633770136d005426adeeb2cd57 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 20 Feb 2024 12:38:23 -0800 Subject: [PATCH 004/351] [libc][__support][bit] add count_zeros (#82076) Will be useful for implementing C23 stdbit.h's stdc_count_zeros and stdc_count_ones. --- libc/src/__support/CPP/bit.h | 29 ++++++++++++++++++++++++ libc/test/src/__support/CPP/bit_test.cpp | 13 +++++++++++ 2 files changed, 42 insertions(+) diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h index f5e50262371f2..7d11e7d5c497e 100644 --- a/libc/src/__support/CPP/bit.h +++ b/libc/src/__support/CPP/bit.h @@ -248,6 +248,35 @@ template >> return value == cpp::numeric_limits::max() ? 0 : countr_zero(value) + 1; } +/// Count number of 1's aka population count or hamming weight. +/// +/// Only unsigned integral types are allowed. 
+template >> +[[nodiscard]] LIBC_INLINE constexpr int count_ones(T value) { + int count = 0; + for (int i = 0; i != cpp::numeric_limits::digits; ++i) + if ((value >> i) & 0x1) + ++count; + return count; +} +#define ADD_SPECIALIZATION(TYPE, BUILTIN) \ + template <> \ + [[nodiscard]] LIBC_INLINE constexpr int count_ones(TYPE value) { \ + return BUILTIN(value); \ + } +ADD_SPECIALIZATION(unsigned char, __builtin_popcount) +ADD_SPECIALIZATION(unsigned short, __builtin_popcount) +ADD_SPECIALIZATION(unsigned, __builtin_popcount) +ADD_SPECIALIZATION(unsigned long, __builtin_popcountl) +ADD_SPECIALIZATION(unsigned long long, __builtin_popcountll) +// TODO: 128b specializations? +#undef ADD_SPECIALIZATION + +template >> +[[nodiscard]] LIBC_INLINE constexpr int count_zeros(T value) { + return count_ones(static_cast(~value)); +} + } // namespace LIBC_NAMESPACE::cpp #endif // LLVM_LIBC_SRC___SUPPORT_CPP_BIT_H diff --git a/libc/test/src/__support/CPP/bit_test.cpp b/libc/test/src/__support/CPP/bit_test.cpp index 5d1f451776a5f..115a5d505c4b7 100644 --- a/libc/test/src/__support/CPP/bit_test.cpp +++ b/libc/test/src/__support/CPP/bit_test.cpp @@ -232,4 +232,17 @@ TYPED_TEST(LlvmLibcBitTest, FirstTrailingOne, UnsignedTypes) { EXPECT_EQ(first_trailing_one(T(1) << i), i + 1); } +TYPED_TEST(LlvmLibcBitTest, CountZeros, UnsignedTypes) { + EXPECT_EQ(count_zeros(T(0)), cpp::numeric_limits::digits); + for (int i = 0; i != cpp::numeric_limits::digits; ++i) + EXPECT_EQ(count_zeros(cpp::numeric_limits::max() >> i), i); +} + +TYPED_TEST(LlvmLibcBitTest, CountOnes, UnsignedTypes) { + EXPECT_EQ(count_ones(T(0)), 0); + for (int i = 0; i != cpp::numeric_limits::digits; ++i) + EXPECT_EQ(count_ones(cpp::numeric_limits::max() >> i), + cpp::numeric_limits::digits - i); +} + } // namespace LIBC_NAMESPACE::cpp From f804e2badf30321121df4d0d7df8e32e10f134cc Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 20 Feb 2024 12:53:39 -0800 Subject: [PATCH 005/351] [ELF] .eh_frame: use errorOrWarn for "PC 
offset is too large" errorOrWarn is more conventional for recoverable errors. This error message does not have to use `fatal`, and we try to remove such uses in parallel code paths. --- lld/ELF/SyntheticSections.cpp | 8 +++++--- lld/test/ELF/eh-frame-pcrel-overflow.s | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index bada394aa30d7..b6bdc350bc0dd 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -537,9 +537,11 @@ SmallVector EhFrameSection::getFdeData() const { for (EhSectionPiece *fde : rec->fdes) { uint64_t pc = getFdePc(buf, fde->outputOff, enc); uint64_t fdeVA = getParent()->addr + fde->outputOff; - if (!isInt<32>(pc - va)) - fatal(toString(fde->sec) + ": PC offset is too large: 0x" + - Twine::utohexstr(pc - va)); + if (!isInt<32>(pc - va)) { + errorOrWarn(toString(fde->sec) + ": PC offset is too large: 0x" + + Twine::utohexstr(pc - va)); + continue; + } ret.push_back({uint32_t(pc - va), uint32_t(fdeVA - va)}); } } diff --git a/lld/test/ELF/eh-frame-pcrel-overflow.s b/lld/test/ELF/eh-frame-pcrel-overflow.s index 78e804768dad6..3dfcf9ee1a7f9 100644 --- a/lld/test/ELF/eh-frame-pcrel-overflow.s +++ b/lld/test/ELF/eh-frame-pcrel-overflow.s @@ -4,7 +4,9 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %p/Inputs/eh-frame-pcrel-overflow.s -o %t1.o # RUN: ld.lld --eh-frame-hdr -Ttext=0x90000000 %t.o -o /dev/null # RUN: not ld.lld --eh-frame-hdr %t.o %t1.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --eh-frame-hdr %t.o %t1.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s --check-prefix=WARN # CHECK: error: {{.*}}.o:(.eh_frame): PC offset is too large: 0x90001054 +# WARN: warning: {{.*}}.o:(.eh_frame): PC offset is too large: 0x90001054 .text .global _start From bb029a5c039766ef83c88a456cf936cec0a1a69b Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 20 Feb 2024 15:56:12 -0500 Subject: [PATCH 006/351] [gn] port 4c6043de0b83 
(InstallAPITests) --- .../gn/secondary/clang/lib/ExtractAPI/BUILD.gn | 1 + .../gn/secondary/clang/lib/InstallAPI/BUILD.gn | 6 +++++- llvm/utils/gn/secondary/clang/unittests/BUILD.gn | 1 + .../secondary/clang/unittests/InstallAPI/BUILD.gn | 13 +++++++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 llvm/utils/gn/secondary/clang/unittests/InstallAPI/BUILD.gn diff --git a/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn index 62b4af0635841..ee60eee0da0fb 100644 --- a/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn @@ -6,6 +6,7 @@ static_library("ExtractAPI") { "//clang/lib/Basic", "//clang/lib/Frontend", "//clang/lib/Index", + "//clang/lib/InstallAPI", "//llvm/lib/Support", "//llvm/lib/TargetParser", ] diff --git a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn index 4d79ac805ac19..6eae7e293dce6 100644 --- a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn @@ -6,5 +6,9 @@ static_library("InstallAPI") { "//llvm/lib/Support", "//llvm/lib/TextAPI", ] - sources = [ "Context.cpp" ] + sources = [ + "Context.cpp", + "FileList.cpp", + "HeaderFile.cpp", + ] } diff --git a/llvm/utils/gn/secondary/clang/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/BUILD.gn index b60c5264d60cc..354934f4b18ab 100644 --- a/llvm/utils/gn/secondary/clang/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/BUILD.gn @@ -13,6 +13,7 @@ group("unittests") { "Format:FormatTests", "Frontend:FrontendTests", "Index:IndexTests", + "InstallAPI:InstallAPITests", "Interpreter:ClangReplInterpreterTests", "Introspection:IntrospectionTests", "Lex:LexTests", diff --git a/llvm/utils/gn/secondary/clang/unittests/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/InstallAPI/BUILD.gn new file mode 100644 
index 0000000000000..e27659457474f --- /dev/null +++ b/llvm/utils/gn/secondary/clang/unittests/InstallAPI/BUILD.gn @@ -0,0 +1,13 @@ +import("//third-party/unittest/unittest.gni") + +unittest("InstallAPITests") { + configs += [ "//llvm/utils/gn/build:clang_code" ] + deps = [ + "//clang/lib/InstallAPI", + "//llvm/lib/Testing/Support", + ] + sources = [ + "HeaderFileTest.cpp", + "FileListTest.cpp", + ] +} From d6850be44d2bfcd79d31fede3b8018357416da03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Wed, 21 Feb 2024 07:59:43 +1100 Subject: [PATCH 007/351] [mlir][linalg] Add e2e test for linalg.mmt4d (#81790) Follow-up for #81422. My intention is to write an e2e test targetting SVE, but more work is needed. Sending this as an intermiedate step. --- .../Integration/Dialect/Linalg/CPU/mmt4d.mlir | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir new file mode 100644 index 0000000000000..8ee4e1fb48fef --- /dev/null +++ b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir @@ -0,0 +1,121 @@ +// DEFINE: %{compile} = mlir-opt %s \ +// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \ +// DEFINE: -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm -o %t +// DEFINE: %{entry_point} = mmt4d +// DEFINE: %{run} = mlir-cpu-runner %t -e %{entry_point} -entry-point-result=void \ +// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils + +// RUN: %{compile} + +// RUN: %{run} | FileCheck %s + +func.func @mmt4d() { + // Allocate the matrices + %A_alloc = tensor.empty() : tensor<2x2x3x1xi32> + %B_alloc = tensor.empty() : tensor<2x2x3x1xi32> + %C_alloc = tensor.empty() : tensor<2x2x3x3xi32> + %C_in = arith.constant dense<[ + [[[ 1, 2, 3], + [ 4, 5, 6], + [ 7, 8, 9]], + [[ 11, 12, 13], + [ 14, 15, 
16], + [ 17, 18, 19]]], + [[[ 21, 22, 23], + [ 24, 25, 26], + [ 27, 28, 29]], + [[ 31, 32, 33], + [ 34, 35, 36], + [ 37, 38, 39]]] + ]> : tensor<2x2x3x3xi32> + + // Initialise the matrices + %three = arith.constant 3 : i32 + %four = arith.constant 4 : i32 + %A = linalg.fill ins(%three : i32) outs(%A_alloc : tensor<2x2x3x1xi32>) -> tensor<2x2x3x1xi32> + %B = linalg.fill ins(%four : i32) outs(%B_alloc : tensor<2x2x3x1xi32>) -> tensor<2x2x3x1xi32> + + // Matmul + %C_out = linalg.mmt4d ins(%A, %B: tensor<2x2x3x1xi32>, tensor<2x2x3x1xi32>) outs(%C_in: tensor<2x2x3x3xi32>) -> tensor<2x2x3x3xi32> + + // Print and verify the output + // CHECK: Unranked Memref {{.*}} rank = 4 offset = 0 sizes = [2, 2, 3, 3] strides = [18, 9, 3, 1] data = + // C[0, 0] + // CHECK-NEXT: [25, 26, 27] + // CHECK-NEXT: [28, 29, 30] + // CHECK-NEXT: [31, 32, 33] + // C[0, 1] + // CHECK-NEXT: [35, 36, 37] + // CHECK-NEXT: [38, 39, 40] + // CHECK-NEXT: [41, 42, 43] + // C[1, 0] + // CHECK-NEXT: [45, 46, 47] + // CHECK-NEXT: [48, 49, 50] + // CHECK-NEXT: [51, 52, 53] + // C[1, 1] + // CHECK-NEXT: [55, 56, 57] + // CHECK-NEXT: [58, 59, 60] + // CHECK-NEXT: [61, 62, 63] + + %xf = tensor.cast %C_out : tensor<2x2x3x3xi32> to tensor<*xi32> + call @printMemrefI32(%xf) : (tensor<*xi32>) -> () + + return +} + +module @transforms attributes { transform.with_named_sequence } { + transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) { + %mmt4d = transform.collect_matching @match_mmt4d in %module : (!transform.any_op) -> (!transform.any_op) + %func = transform.get_parent_op %mmt4d {isolated_from_above} : (!transform.any_op) -> !transform.op<"func.func"> + + // Step 1: Tile + // Tile parallel dims + %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d[1, 1, 0, 3, 3, 0] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + // Tile reduction dims + %tiled_linalg_op_r, %loops2:2 = 
transform.structured.tile_using_for %tiled_linalg_op_p[0, 0, 1, 0, 0, 1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + + // Step 2: Vectorize + transform.structured.vectorize %tiled_linalg_op_r : !transform.any_op + + // Step 3: Simplify + // vector.multi_reduction --> vector.contract + // Generates a 6-dim vector.contract with the dim matching the original MMT4D Op + // and with the following split into parallel and reduction dims: + // * parallel, parallel, reduction, parallel, parallel, reduction + transform.apply_patterns to %func { + transform.apply_patterns.vector.reduction_to_contract + // Reduce the rank of xfer ops. This transforms vector.contract to be + // more matmul-like and to enable the lowering to outer product Ops. + transform.apply_patterns.vector.transfer_permutation_patterns + } : !transform.op<"func.func"> + + // Hoisting and LICM - not strictly required + %func_h = transform.structured.hoist_redundant_vector_transfers %func + : (!transform.op<"func.func">) -> !transform.op<"func.func"> + %all_loops = transform.structured.match interface{LoopLikeInterface} in %func_h + : (!transform.op<"func.func">) -> !transform.any_op + transform.apply_licm to %all_loops : !transform.any_op + transform.loop.hoist_loop_invariant_subsets %all_loops : !transform.any_op + + // Simplify the 6-dim vector.contract into a 3-dim matmul-like + // vector.contract with the following split into parallel and reduction + // dims: + // * parallel, parallel, reduction + transform.apply_patterns to %func_h { + transform.apply_patterns.vector.reduction_to_contract + transform.apply_patterns.vector.cast_away_vector_leading_one_dim + transform.apply_patterns.canonicalization + } : !transform.op<"func.func"> + transform.yield + } + + transform.named_sequence @match_mmt4d( + %entry: !transform.any_op {transform.readonly}) -> !transform.any_op { + transform.match.operation_name %entry ["linalg.mmt4d"] : !transform.any_op + transform.yield 
%entry : !transform.any_op + } +} + +func.func private @printMemrefI32(%ptr : tensor<*xi32>) From 7542f60b722d87fb64e911439cb7b64344a48763 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 20 Feb 2024 21:08:05 +0000 Subject: [PATCH 008/351] [gn build] Port c625b9965273 --- llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn index a813bf3f508b2..31f567d6df159 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn @@ -50,6 +50,7 @@ static_library("Orc") { "OrcABISupport.cpp", "OrcV2CBindings.cpp", "RTDyldObjectLinkingLayer.cpp", + "SectCreate.cpp", "SimpleRemoteEPC.cpp", "SpeculateAnalyses.cpp", "Speculation.cpp", From 1db2859dd972dfe1284564c43d55c30280f977d5 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:21:21 -0800 Subject: [PATCH 009/351] [flang] Handle more use cases reported for issues/78797 (#79628) I implemented legacy "token pasting" via line continuation for call prefix& &MACRO& &suffix(1) in a recent patch; this patch addresses the related cases call prefix& &MACRO& &(1) and call & &MACRO& &suffix(1) Fixes the latest https://github.com/llvm/llvm-project/issues/79590. 
--- flang/lib/Parser/prescan.cpp | 20 +++++++++++++++----- flang/test/Preprocessing/pp005.F | 6 +++--- flang/test/Preprocessing/pp006.F | 6 +++--- flang/test/Preprocessing/pp105.F90 | 6 +++--- flang/test/Preprocessing/pp106.F90 | 6 +++--- flang/test/Preprocessing/pp134.F90 | 18 ++++++++++++++++-- 6 files changed, 43 insertions(+), 19 deletions(-) diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index f7f22177a7d0b..e9b23172ed2e2 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -630,9 +630,11 @@ bool Prescanner::NextToken(TokenSequence &tokens) { preventHollerith_ = false; } else if (IsLegalInIdentifier(*at_)) { int parts{1}; + const char *afterLast{nullptr}; do { EmitChar(tokens, *at_); ++at_, ++column_; + afterLast = at_; if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) { tokens.CloseToken(); ++parts; @@ -640,12 +642,20 @@ bool Prescanner::NextToken(TokenSequence &tokens) { } while (IsLegalInIdentifier(*at_)); if (parts >= 3) { // Subtlety: When an identifier is split across three or more continuation - // lines, its parts are kept as distinct pp-tokens so that macro - // operates on them independently. This trick accommodates the historic - // practice of using line continuation for token pasting after - // replacement. + // lines (or two continuation lines, immediately preceded or followed + // by '&' free form continuation line markers, its parts are kept as + // distinct pp-tokens so that macro operates on them independently. + // This trick accommodates the historic practice of using line + // continuation for token pasting after replacement. } else if (parts == 2) { - tokens.ReopenLastToken(); + if ((start > start_ && start[-1] == '&') || + (afterLast < limit_ && (*afterLast == '&' || *afterLast == '\n'))) { + // call & call foo& call foo& + // &MACRO& OR &MACRO& OR &MACRO + // &foo(...) &(...) 
+ } else { + tokens.ReopenLastToken(); + } } if (InFixedFormSource()) { SkipSpaces(); diff --git a/flang/test/Preprocessing/pp005.F b/flang/test/Preprocessing/pp005.F index e4483b404c367..a8d7394cb12d3 100644 --- a/flang/test/Preprocessing/pp005.F +++ b/flang/test/Preprocessing/pp005.F @@ -1,11 +1,11 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: res = 777 +! CHECK: res = (777) * KWM split across continuation, implicit padding integer, parameter :: KWM = 666 #define KWM 777 integer :: res - res = KW - +M + res = (KW + +M) if (res .eq. 777) then print *, 'pp005.F yes' else diff --git a/flang/test/Preprocessing/pp006.F b/flang/test/Preprocessing/pp006.F index f526ad31733ef..e45dcf9c18e19 100644 --- a/flang/test/Preprocessing/pp006.F +++ b/flang/test/Preprocessing/pp006.F @@ -1,12 +1,12 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: res = 777 +! CHECK: res = (777) * ditto, but with intervening *comment line integer, parameter :: KWM = 666 #define KWM 777 integer :: res - res = KW + res = (KW *comment - +M + +M) if (res .eq. 777) then print *, 'pp006.F yes' else diff --git a/flang/test/Preprocessing/pp105.F90 b/flang/test/Preprocessing/pp105.F90 index b4f73da6fa24c..e861e9688d2c5 100644 --- a/flang/test/Preprocessing/pp105.F90 +++ b/flang/test/Preprocessing/pp105.F90 @@ -1,11 +1,11 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: res = 777 +! CHECK: res = (777) ! KWM call name split across continuation, with leading & integer, parameter :: KWM = 666 #define KWM 777 integer :: res - res = KW& -&M + res = (KW& +&M) if (res .eq. 777) then print *, 'pp105.F90 yes' else diff --git a/flang/test/Preprocessing/pp106.F90 b/flang/test/Preprocessing/pp106.F90 index 556d779048f6c..a450807f0bd21 100644 --- a/flang/test/Preprocessing/pp106.F90 +++ b/flang/test/Preprocessing/pp106.F90 @@ -1,11 +1,11 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: res = 777 +! CHECK: res = (777) ! ditto, with & ! 
comment integer, parameter :: KWM = 666 #define KWM 777 integer :: res - res = KW& ! comment -&M + res = (KW& ! comment +&M) if (res .eq. 777) then print *, 'pp106.F90 yes' else diff --git a/flang/test/Preprocessing/pp134.F90 b/flang/test/Preprocessing/pp134.F90 index 01e7b010d426e..bc34767224fa0 100644 --- a/flang/test/Preprocessing/pp134.F90 +++ b/flang/test/Preprocessing/pp134.F90 @@ -1,9 +1,23 @@ ! RUN: %flang -E %s 2>&1 | FileCheck %s -! CHECK: print *, ADC +! CHECK: print *, ADC, 1 +! CHECK: print *, AD, 1 +! CHECK: print *, DC, 1 +! CHECK: print *, AD +! CHECK: print *, AB #define B D implicit none real ADC print *, A& &B& - &C + &C, 1 +print *, A& + &B& + &, 1 +print *, & + &B& + &C, 1 +print *, A& + &B +print *, A& + &B ! but not this end From 1219214a3bcc51022492928b8bb4ff4bdb75d0cb Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Tue, 20 Feb 2024 15:29:05 -0600 Subject: [PATCH 010/351] [Hexagon] Update InstrInfo to include LD/ST offsets of vector instructions (#82386) The hook HexagonInstrInfo::isValidOffset() is updated to evaluate offsets of missed LD/ST vector instructions. 
--- llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 28 +++++++++ .../CodeGen/Hexagon/ldst_vector_offset.ll | 59 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 llvm/test/CodeGen/Hexagon/ldst_vector_offset.ll diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 6c7e88fbe2eb8..619c7dc69f9b2 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -2765,12 +2765,40 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::PS_vloadrw_nt_ai: case Hexagon::V6_vL32b_ai: case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vS32b_pred_ai: + case Hexagon::V6_vS32b_npred_ai: case Hexagon::V6_vS32b_qpred_ai: case Hexagon::V6_vS32b_nqpred_ai: + case Hexagon::V6_vS32b_new_ai: + case Hexagon::V6_vS32b_new_pred_ai: + case Hexagon::V6_vS32b_new_npred_ai: + case Hexagon::V6_vS32b_nt_pred_ai: + case Hexagon::V6_vS32b_nt_npred_ai: + case Hexagon::V6_vS32b_nt_new_ai: + case Hexagon::V6_vS32b_nt_new_pred_ai: + case Hexagon::V6_vS32b_nt_new_npred_ai: + case Hexagon::V6_vS32b_nt_qpred_ai: + case Hexagon::V6_vS32b_nt_nqpred_ai: case Hexagon::V6_vL32b_nt_ai: case Hexagon::V6_vS32b_nt_ai: case Hexagon::V6_vL32Ub_ai: case Hexagon::V6_vS32Ub_ai: + case Hexagon::V6_vL32b_cur_ai: + case Hexagon::V6_vL32b_tmp_ai: + case Hexagon::V6_vL32b_pred_ai: + case Hexagon::V6_vL32b_npred_ai: + case Hexagon::V6_vL32b_cur_pred_ai: + case Hexagon::V6_vL32b_cur_npred_ai: + case Hexagon::V6_vL32b_tmp_pred_ai: + case Hexagon::V6_vL32b_tmp_npred_ai: + case Hexagon::V6_vL32b_nt_cur_ai: + case Hexagon::V6_vL32b_nt_tmp_ai: + case Hexagon::V6_vL32b_nt_pred_ai: + case Hexagon::V6_vL32b_nt_npred_ai: + case Hexagon::V6_vL32b_nt_cur_pred_ai: + case Hexagon::V6_vL32b_nt_cur_npred_ai: + case Hexagon::V6_vL32b_nt_tmp_pred_ai: + case Hexagon::V6_vL32b_nt_tmp_npred_ai: case Hexagon::V6_vgathermh_pseudo: case Hexagon::V6_vgathermw_pseudo: case Hexagon::V6_vgathermhw_pseudo: 
diff --git a/llvm/test/CodeGen/Hexagon/ldst_vector_offset.ll b/llvm/test/CodeGen/Hexagon/ldst_vector_offset.ll new file mode 100644 index 0000000000000..15695e8350165 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/ldst_vector_offset.ll @@ -0,0 +1,59 @@ +; REQUIRES: asserts +; RUN: llc -O3 -march=hexagon < %s -o /dev/null +; Make sure that this doesn't crash. +; This test validates that the compiler would not assert when analyzing the +; offset of V6_vS32b_pred_ai instruction + +%struct.pluto = type <{ ptr, i16, ptr }> + +@global = external hidden unnamed_addr constant [62 x i8], align 1 +@global.1 = external hidden unnamed_addr constant [47 x i8], align 1 +@global.2 = hidden local_unnamed_addr constant %struct.pluto <{ ptr @global, i16 892, ptr @global.1 }>, align 1 +@global.3 = local_unnamed_addr constant [1 x i32] zeroinitializer + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare <32 x i32> @llvm.hexagon.V6.vd0.128B() #1 + +; Function Attrs: noinline nounwind +declare hidden fastcc void @zot(i32, i32, i32, i32) unnamed_addr #2 + +; Function Attrs: noinline nounwind +define void @barney(ptr nocapture %arg, ptr nocapture readnone %arg1, i8 signext %arg2, i32 %arg3, ptr nocapture readnone %arg4, ptr nocapture readnone %arg5, i32 %arg6, i32 %arg7, ptr nocapture readnone %arg8, ptr nocapture readnone %arg9, ptr nocapture readnone %arg10, ptr nocapture readnone %arg11, ptr nocapture readnone %arg12, ptr nocapture readnone %arg13, ptr nocapture readnone %arg14, ptr nocapture readnone %arg15, ptr nocapture readnone %arg16, ptr nocapture readnone %arg17) local_unnamed_addr #2 { +bb: + %icmp = icmp ult i32 %arg3, 4 + tail call void @llvm.assume(i1 %icmp) #3 + %call = tail call <32 x i32> @llvm.hexagon.V6.vd0.128B() #3 + br label %bb18 + +bb18: ; preds = %bb22, %bb + %phi = phi i32 [ %and, %bb22 ], [ 
%arg3, %bb ] + %phi19 = phi i32 [ %add23, %bb22 ], [ 4, %bb ] + %icmp20 = icmp eq i32 %phi, 0 + br i1 %icmp20, label %bb21, label %bb22 + +bb21: ; preds = %bb18 + %shl = shl i32 %phi19, 8 + %getelementptr = getelementptr inbounds i8, ptr %arg, i32 %shl + %bitcast = bitcast ptr %getelementptr to ptr + store <32 x i32> %call, ptr %bitcast, align 128 + br label %bb22 + +bb22: ; preds = %bb21, %bb18 + %add = add nuw nsw i32 %phi, 1 + %and = and i32 %add, 3 + %add23 = add nuw nsw i32 %phi19, 1 + %icmp24 = icmp eq i32 %add23, 8 + br i1 %icmp24, label %bb25, label %bb18 + +bb25: ; preds = %bb22 + tail call fastcc void @zot(i32 %arg6, i32 %arg7, i32 0, i32 %arg3) + ret void +} + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { noinline nounwind "target-cpu"="hexagonv68" "target-features"="+hvx-length128b,+hvxv68,+v68,+hvx-ieee-fp,-long-calls,-small-data" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #3 = { nounwind } From 96b17043507caec02a2ef440b369506122bdeb11 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 20 Feb 2024 13:41:15 -0800 Subject: [PATCH 011/351] [flang][runtime] Don't write implied ENDFILE for REC=/POS= (#79637) An implied ENDFILE record, which truncates an external file, should be written to a sequential unit whenever the file is repositioned for a BACKSPACE or REWIND statement if a WRITE statement has executed since the last OPEN/BACKSPACE/REWIND. But the REC= and POS= positioning specifiers don't apply to sequential units (they're for direct and stream units, resp.), so don't truncate the file when they're used. 
--- flang/runtime/unit.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp index 18590567c65eb..58ca313d9e445 100644 --- a/flang/runtime/unit.cpp +++ b/flang/runtime/unit.cpp @@ -679,6 +679,7 @@ void ExternalFileUnit::Rewind(IoErrorHandler &handler) { handler.SignalError(IostatRewindNonSequential, "REWIND(UNIT=%d) on non-sequential file", unitNumber()); } else { + DoImpliedEndfile(handler); SetPosition(0, handler); currentRecordNumber = 1; leftTabLimit.reset(); @@ -687,7 +688,6 @@ void ExternalFileUnit::Rewind(IoErrorHandler &handler) { } void ExternalFileUnit::SetPosition(std::int64_t pos, IoErrorHandler &handler) { - DoImpliedEndfile(handler); frameOffsetInFile_ = pos; recordOffsetInFrame_ = 0; if (access == Access::Direct) { @@ -707,6 +707,12 @@ bool ExternalFileUnit::SetStreamPos( "POS=%zd is invalid", static_cast(oneBasedPos)); return false; } + // A backwards POS= implies truncation after writing, at least in + // Intel and NAG. + if (static_cast(oneBasedPos - 1) < + frameOffsetInFile_ + recordOffsetInFrame_) { + DoImpliedEndfile(handler); + } SetPosition(oneBasedPos - 1, handler); // We no longer know which record we're in. Set currentRecordNumber to // a large value from whence we can both advance and backspace. From 78762357d449cfcd11426c8e152302a27f2e7d4d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 20 Feb 2024 13:59:49 -0800 Subject: [PATCH 012/351] [ELF] Support placing .lbss/.lrodata/.ldata after .bss https://reviews.llvm.org/D150510 places .lrodata before .rodata to minimize the number of permission transitions in the memory image. However, this layout is less ideal for -fno-pic code (which is still important). Small code model -fno-pic code has R_X86_64_32S relocations with a range of `[0,2**31)` (if we ignore the negative area). Placing `.lrodata` earlier exerts relocation pressure on such code. 
Non-x86 64-bit architectures generally have a similar `[0,2**31)` limitation if they don't use PC-relative relocations. If we place .lrodata later, we will need one extra PT_LOAD. Two layouts are appealing: * .bss/.lbss/.lrodata/.ldata (GNU ld) * .bss/.ldata/.lbss/.lrodata The GNU ld layout has the nice property that there is only one BSS (except .tbss/.relro_padding). Add -z lrodata-after-bss to support this layout. Since a read-only PT_LOAD segment (for large data sections) may appear after RW PT_LOAD segments. The placement of `_etext` has to be adjusted. Pull Request: https://github.com/llvm/llvm-project/pull/81224 --- lld/ELF/Config.h | 1 + lld/ELF/Driver.cpp | 2 ++ lld/ELF/Writer.cpp | 47 ++++++++++++++++++---------- lld/docs/ld.lld.1 | 3 ++ lld/test/ELF/lto/codemodel.ll | 8 ++--- lld/test/ELF/x86-64-section-layout.s | 38 +++++++++++++++++++++- 6 files changed, 77 insertions(+), 22 deletions(-) diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index fcca8c42b29b7..691ebfc074320 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -310,6 +310,7 @@ struct Config { bool zInitfirst; bool zInterpose; bool zKeepTextSectionPrefix; + bool zLrodataAfterBss; bool zNodefaultlib; bool zNodelete; bool zNodlopen; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 4bb9b7a0b2a98..24faa1753f1e3 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1436,6 +1436,8 @@ static void readConfigs(opt::InputArgList &args) { config->zInterpose = hasZOption(args, "interpose"); config->zKeepTextSectionPrefix = getZFlag( args, "keep-text-section-prefix", "nokeep-text-section-prefix", false); + config->zLrodataAfterBss = + getZFlag(args, "lrodata-after-bss", "nolrodata-after-bss", false); config->zNodefaultlib = hasZOption(args, "nodefaultlib"); config->zNodelete = hasZOption(args, "nodelete"); config->zNodlopen = hasZOption(args, "nodlopen"); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 5b7dfd358e764..0bbf43ddf694a 100644 --- a/lld/ELF/Writer.cpp +++ 
b/lld/ELF/Writer.cpp @@ -911,11 +911,12 @@ enum RankFlags { RF_NOT_ALLOC = 1 << 26, RF_PARTITION = 1 << 18, // Partition number (8 bits) RF_NOT_SPECIAL = 1 << 17, - RF_WRITE = 1 << 16, - RF_EXEC_WRITE = 1 << 15, - RF_EXEC = 1 << 14, - RF_RODATA = 1 << 13, - RF_LARGE = 1 << 12, + RF_LARGE_ALT = 1 << 15, + RF_WRITE = 1 << 14, + RF_EXEC_WRITE = 1 << 13, + RF_EXEC = 1 << 12, + RF_RODATA = 1 << 11, + RF_LARGE = 1 << 10, RF_NOT_RELRO = 1 << 9, RF_NOT_TLS = 1 << 8, RF_BSS = 1 << 7, @@ -974,8 +975,14 @@ static unsigned getSectionRank(OutputSection &osec) { if (osec.type == SHT_PROGBITS) rank |= RF_RODATA; // Among PROGBITS sections, place .lrodata further from .text. - if (!(osec.flags & SHF_X86_64_LARGE && config->emachine == EM_X86_64)) - rank |= RF_LARGE; + // For -z lrodata-after-bss, place .lrodata after .lbss like GNU ld. This + // layout has one extra PT_LOAD, but alleviates relocation overflow + // pressure for absolute relocations referencing small data from -fno-pic + // relocatable files. + if (osec.flags & SHF_X86_64_LARGE && config->emachine == EM_X86_64) + rank |= config->zLrodataAfterBss ? RF_LARGE_ALT : 0; + else + rank |= config->zLrodataAfterBss ? 0 : RF_LARGE; } else if (isExec) { rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC; } else { @@ -988,10 +995,15 @@ static unsigned getSectionRank(OutputSection &osec) { osec.relro = true; else rank |= RF_NOT_RELRO; - // Place .ldata and .lbss after .bss. Making .bss closer to .text alleviates - // relocation overflow pressure. - if (osec.flags & SHF_X86_64_LARGE && config->emachine == EM_X86_64) - rank |= RF_LARGE; + // Place .ldata and .lbss after .bss. Making .bss closer to .text + // alleviates relocation overflow pressure. + // For -z lrodata-after-bss, place .lbss/.lrodata/.ldata after .bss. + // .bss/.lbss being adjacent reuses the NOBITS size optimization. + if (osec.flags & SHF_X86_64_LARGE && config->emachine == EM_X86_64) { + rank |= config->zLrodataAfterBss + ? (osec.type == SHT_NOBITS ? 
1 : RF_LARGE_ALT) + : RF_LARGE; + } } // Within TLS sections, or within other RelRo sections, or within non-RelRo @@ -1103,7 +1115,7 @@ template void Writer::setReservedSymbolSections() { } PhdrEntry *last = nullptr; - PhdrEntry *lastRO = nullptr; + OutputSection *lastRO = nullptr; auto isLarge = [](OutputSection *osec) { return config->emachine == EM_X86_64 && osec->flags & SHF_X86_64_LARGE; }; @@ -1112,17 +1124,18 @@ template void Writer::setReservedSymbolSections() { if (p->p_type != PT_LOAD) continue; last = p; - if (!(p->p_flags & PF_W)) - lastRO = p; + if (!(p->p_flags & PF_W) && p->lastSec && !isLarge(p->lastSec)) + lastRO = p->lastSec; } } if (lastRO) { - // _etext is the first location after the last read-only loadable segment. + // _etext is the first location after the last read-only loadable segment + // that does not contain large sections. if (ElfSym::etext1) - ElfSym::etext1->section = lastRO->lastSec; + ElfSym::etext1->section = lastRO; if (ElfSym::etext2) - ElfSym::etext2->section = lastRO->lastSec; + ElfSym::etext2->section = lastRO; } if (last) { diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index 12b17dd37796d..e4d39e47f5c5a 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -791,6 +791,9 @@ flag to indicate to the runtime linker that the object is an interposer. During symbol resolution interposers are searched after the application but before other dependencies. .Pp +.It Cm lrodata-after-bss +Place .lrodata after .bss. +.Pp .It Cm muldefs Do not error if a symbol is defined multiple times. The first definition will be used. 
diff --git a/lld/test/ELF/lto/codemodel.ll b/lld/test/ELF/lto/codemodel.ll index a35f87729411d..cf7d0e409ec4b 100644 --- a/lld/test/ELF/lto/codemodel.ll +++ b/lld/test/ELF/lto/codemodel.ll @@ -2,8 +2,8 @@ ; RUN: llvm-as %s -o %t.o ; RUN: ld.lld %t.o -o %ts -mllvm -code-model=small ; RUN: ld.lld %t.o -o %tl -mllvm -code-model=large -; RUN: llvm-objdump --no-print-imm-hex -d %ts | FileCheck %s --check-prefix=CHECK-SMALL -; RUN: llvm-objdump --no-print-imm-hex -d %tl | FileCheck %s --check-prefix=CHECK-LARGE +; RUN: llvm-objdump -d %ts | FileCheck %s --check-prefix=CHECK-SMALL +; RUN: llvm-objdump -d %tl | FileCheck %s --check-prefix=CHECK-LARGE target triple = "x86_64-unknown-linux-gnu" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -13,8 +13,8 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 define ptr @_start() nounwind readonly { entry: ; CHECK-SMALL-LABEL: <_start>: -; CHECK-SMALL: movl $2097440, %eax +; CHECK-SMALL: movl ${{.*}}, %eax ; CHECK-LARGE-LABEL: <_start>: -; CHECK-LARGE: movabsq $2097440, %rax +; CHECK-LARGE: movabsq ${{.*}}, %rax ret ptr @data } diff --git a/lld/test/ELF/x86-64-section-layout.s b/lld/test/ELF/x86-64-section-layout.s index 0ba6053938939..b03d3e6c2b999 100644 --- a/lld/test/ELF/x86-64-section-layout.s +++ b/lld/test/ELF/x86-64-section-layout.s @@ -12,9 +12,12 @@ # RUN: ld.lld --section-start=.note=0x200300 a1.o -o a1 # RUN: llvm-readelf -S -sX a1 | FileCheck %s --check-prefix=CHECK1 -# RUN: ld.lld -T b.lds -z norelro a.o -o b +# RUN: ld.lld -T b.lds -z norelro a.o -z lrodata-after-bss -z nolrodata-after-bss -o b --fatal-warnings # RUN: llvm-readelf -S -l b | FileCheck %s --check-prefix=CHECK2 +# RUN: ld.lld --section-start=.note=0x200300 a.o -z lrodata-after-bss -o a3 +# RUN: llvm-readelf -S -l -sX a3 | FileCheck %s --check-prefix=CHECK3 + # CHECK: Name Type Address Off Size ES Flg Lk Inf Al # CHECK-NEXT: NULL 0000000000000000 000000 000000 00 0 0 0 # 
CHECK-NEXT: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1 @@ -80,6 +83,39 @@ # CHECK2-NEXT: LOAD 0x000305 0x0000000000200305 0x0000000000200305 0x001805 0x002a06 RW 0x1000 # CHECK2-NEXT: TLS 0x000305 0x0000000000200305 0x0000000000200305 0x000001 0x000003 R 0x1 +# CHECK3: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK3-NEXT: NULL 0000000000000000 000000 000000 00 0 0 0 +# CHECK3-NEXT: .note NOTE 0000000000200300 000300 000001 00 A 0 0 1 +# CHECK3-NEXT: .rodata PROGBITS 0000000000200301 000301 000001 00 A 0 0 1 +# CHECK3-NEXT: .text PROGBITS 0000000000201304 000304 000001 00 AX 0 0 4 +# CHECK3-NEXT: .tdata PROGBITS 0000000000202305 000305 000001 00 WAT 0 0 1 +# CHECK3-NEXT: .tbss NOBITS 0000000000202306 000306 000002 00 WAT 0 0 1 +# CHECK3-NEXT: .relro_padding NOBITS 0000000000202306 000306 000cfa 00 WA 0 0 1 +# CHECK3-NEXT: .data PROGBITS 0000000000203306 000306 000001 00 WA 0 0 1 +# CHECK3-NEXT: .bss NOBITS 0000000000203307 000307 001800 00 WA 0 0 1 +## We spend (size(.bss) + size(.lbss)) % MAXPAGESIZE bytes. 
+# CHECK3-NEXT: .lbss NOBITS 0000000000204b07 000307 001201 00 WAl 0 0 1 +# CHECK3-NEXT: .lrodata PROGBITS 0000000000206d08 000d08 000002 00 Al 0 0 1 +# CHECK3-NEXT: .ldata PROGBITS 0000000000207d0a 000d0a 000002 00 WAl 0 0 1 +# CHECK3-NEXT: .ldata2 PROGBITS 0000000000207d0c 000d0c 000001 00 WAl 0 0 1 +# CHECK3-NEXT: .comment PROGBITS 0000000000000000 000d0d {{.*}} 01 MS 0 0 1 + +# CHECK3: Program Headers: +# CHECK3-NEXT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# CHECK3-NEXT: PHDR 0x000040 0x0000000000200040 0x0000000000200040 {{.*}} {{.*}} R 0x8 +# CHECK3-NEXT: LOAD 0x000000 0x0000000000200000 0x0000000000200000 0x000302 0x000302 R 0x1000 +# CHECK3-NEXT: LOAD 0x000304 0x0000000000201304 0x0000000000201304 0x000001 0x000001 R E 0x1000 +# CHECK3-NEXT: LOAD 0x000305 0x0000000000202305 0x0000000000202305 0x000001 0x000cfb RW 0x1000 +# CHECK3-NEXT: LOAD 0x000306 0x0000000000203306 0x0000000000203306 0x000001 0x002a02 RW 0x1000 +# CHECK3-NEXT: LOAD 0x000d08 0x0000000000206d08 0x0000000000206d08 0x000002 0x000002 R 0x1000 +# CHECK3-NEXT: LOAD 0x000d0a 0x0000000000207d0a 0x0000000000207d0a 0x000003 0x000003 RW 0x1000 +# CHECK3-NEXT: TLS 0x000305 0x0000000000202305 0x0000000000202305 0x000001 0x000003 R 0x1 + +# CHECK3: 0000000000201304 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _start +# CHECK3-NEXT: 0000000000201305 0 NOTYPE GLOBAL DEFAULT [[#]] (.text) _etext +# CHECK3-NEXT: 0000000000203307 0 NOTYPE GLOBAL DEFAULT [[#]] (.data) _edata +# CHECK3-NEXT: 0000000000207d0d 0 NOTYPE GLOBAL DEFAULT [[#]] (.ldata2) _end + #--- a.s .globl _start, _etext, _edata, _end _start: From 5a20a208037d32d52f0c626ea3b199278ff0df0a Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 20 Feb 2024 14:08:37 -0800 Subject: [PATCH 013/351] [flang] Resolve "possible performance problem" issue spam (#79769) Four "issues" on GitHub report possible performance problems, likely detected by static analysis. 
None of them would ever make a measurable difference in compilation time, but I'm resolving them to clean up the open issues list. Fixes https://github.com/llvm/llvm-project/issues/79703, .../79705, .../79706, & .../79707. --- flang/lib/Lower/OpenACC.cpp | 2 +- flang/lib/Optimizer/CodeGen/Target.cpp | 2 +- flang/lib/Parser/preprocessor.cpp | 2 +- flang/lib/Parser/preprocessor.h | 2 +- flang/lib/Semantics/check-directive-structure.h | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 446b1529ca008..151077d81ba14 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -3247,7 +3247,7 @@ static void createDeclareGlobalOp(mlir::OpBuilder &modBuilder, fir::FirOpBuilder &builder, mlir::Location loc, fir::GlobalOp globalOp, mlir::acc::DataClause clause, - const std::string declareGlobalName, + const std::string &declareGlobalName, bool implicit, std::stringstream &asFortran) { GlobalOp declareGlobalOp = modBuilder.create(loc, declareGlobalName); diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 19730f7a64337..7c77bdd79008f 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -47,7 +47,7 @@ static const llvm::fltSemantics &floatToSemantics(const KindMapping &kindMap, } static void typeTodo(const llvm::fltSemantics *sem, mlir::Location loc, - std::string context) { + const std::string &context) { if (sem == &llvm::APFloat::IEEEhalf()) { TODO(loc, "COMPLEX(KIND=2): for " + context + " type"); } else if (sem == &llvm::APFloat::BFloat()) { diff --git a/flang/lib/Parser/preprocessor.cpp b/flang/lib/Parser/preprocessor.cpp index 4c2bd31a2ae84..515b8f62daf9a 100644 --- a/flang/lib/Parser/preprocessor.cpp +++ b/flang/lib/Parser/preprocessor.cpp @@ -252,7 +252,7 @@ void Preprocessor::DefineStandardMacros() { Define("__LINE__"s, "__LINE__"s); } -void Preprocessor::Define(std::string 
macro, std::string value) { +void Preprocessor::Define(const std::string ¯o, const std::string &value) { definitions_.emplace(SaveTokenAsName(macro), Definition{value, allSources_}); } diff --git a/flang/lib/Parser/preprocessor.h b/flang/lib/Parser/preprocessor.h index 3b456364944c3..b61f1577727be 100644 --- a/flang/lib/Parser/preprocessor.h +++ b/flang/lib/Parser/preprocessor.h @@ -70,7 +70,7 @@ class Preprocessor { AllSources &allSources() { return allSources_; } void DefineStandardMacros(); - void Define(std::string macro, std::string value); + void Define(const std::string ¯o, const std::string &value); void Undefine(std::string macro); bool IsNameDefined(const CharBlock &); bool IsFunctionLikeDefinition(const CharBlock &); diff --git a/flang/lib/Semantics/check-directive-structure.h b/flang/lib/Semantics/check-directive-structure.h index 829405f99d64c..97e13c59ac416 100644 --- a/flang/lib/Semantics/check-directive-structure.h +++ b/flang/lib/Semantics/check-directive-structure.h @@ -176,8 +176,8 @@ template class DirectiveStructureChecker : public virtual BaseChecker { protected: DirectiveStructureChecker(SemanticsContext &context, - std::unordered_map> - directiveClausesMap) + const std::unordered_map> + &directiveClausesMap) : context_{context}, directiveClausesMap_(directiveClausesMap) {} virtual ~DirectiveStructureChecker() {} From adf838daee63b3245c8822957988da5367e1572c Mon Sep 17 00:00:00 2001 From: "Balaji V. Iyer" <43187390+bviyer@users.noreply.github.com> Date: Tue, 20 Feb 2024 16:10:14 -0600 Subject: [PATCH 014/351] [mlir][Vectorizer] Added support to Vectorize tensor.unpack (#76087) Added support to vectorized tensor.unpack. The unpack Op is split into a `vector.transfer_read`, `vector.transpose`, `vector.shape_cast` and a `vector.transfer_write`. 
--- .../include/mlir/Dialect/Tensor/Utils/Utils.h | 12 +- .../TransformOps/LinalgTransformOps.cpp | 3 +- .../Dialect/Linalg/Transforms/Transforms.cpp | 2 +- .../Linalg/Transforms/Vectorization.cpp | 182 +++++++++++++++--- mlir/lib/Dialect/Tensor/Utils/Utils.cpp | 77 ++++++-- mlir/test/Dialect/Linalg/vectorization.mlir | 115 +++++++++++ 6 files changed, 337 insertions(+), 54 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h index fe9b16cb44b3d..d09c9e36f6ff8 100644 --- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h @@ -32,13 +32,11 @@ FailureOr computeTransposedType(RankedTensorType rankedTensorType, ArrayRef transposeVector); -/// Given a tensor::PackOp, compute the permutation vector to shuffle the -/// packed shape into the shape before any outer or inner permutations have -/// been applied. -/// i.e. for a pack from an ABCD layout to an ABCDba: -/// The packed shape would be ABCDba. -/// The pre-permutation shape would be AaBbCD. -SmallVector getPackInverseDestPermutation(PackOp packOp); +SmallVector getPackInverseDestPerm(tensor::PackOp packOp); +SmallVector getUnPackInverseSrcPerm(tensor::UnPackOp unpackOp); + +SmallVector getUnPackInverseSrcPerm(tensor::UnPackOp, + PackingMetadata &metadata); /// A tensor.insert_slice is a cast-like operation if it merely rank-extends the /// source tensor or inserts the source tensor into a destination tensor with diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 4ef8859fd5c43..299965bcfc3ab 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3152,7 +3152,8 @@ DiagnosedSilenceableFailure transform::VectorizeOp::apply( // TODO: Check that the correct number of vectorSizes was provided. 
for (Operation *target : targets) { - if (!isa(target)) { + if (!isa( + target)) { return mlir::emitSilenceableFailure(target->getLoc()) << "Unsupported Op, cannot vectorize"; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 01b393644679c..a17bc8e4cd318 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -237,7 +237,7 @@ FailureOr linalg::lowerPack(RewriterBase &rewriter, PackingMetadata packingMetadata = computePackingMetadata( packedTensorType.getRank(), packOp.getInnerDimsPos()); SmallVector packedToStripMinedShapePerm = - tensor::getPackInverseDestPermutation(packOp); + tensor::getPackInverseDestPerm(packOp); // 3. Compute the stripMinedShape: this is the packed shape before any outer // or inner permutations have been applied. diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 2bd6929fea614..ac043e87223df 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1405,8 +1405,7 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state, /// permutations. static SmallVector getTiledPackShape(tensor::PackOp packOp, ArrayRef destShape) { - return applyPermutation(destShape, - tensor::getPackInverseDestPermutation(packOp)); + return applyPermutation(destShape, tensor::getPackInverseDestPerm(packOp)); } /// Create a TransferReadOp from `source` with static shape `readShape`. If the @@ -1547,7 +1546,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, // Create TransposeOp. 
auto destPermutation = - invertPermutationVector(tensor::getPackInverseDestPermutation(packOp)); + invertPermutationVector(tensor::getPackInverseDestPerm(packOp)); auto transposeOp = rewriter.create( loc, shapeCastOp.getResult(), destPermutation); @@ -1559,6 +1558,112 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, return success(); } +/// Vectorize a `tensor::UnPackOp` to these 4 Ops: +/// Vector::TransferReadOp - Reads a vector from the source tensor +/// vector::TransposeOp - Transpose the Source tensor +/// ShapeCastOp - Reshape the data based on the target. +/// vector::TransferWriteOp. - Write the result vector back to the destination +/// tensor +static LogicalResult +vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp, + ArrayRef inputVectorSizes, + SmallVectorImpl &newResults) { + + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(unpackOp); + + RankedTensorType unpackTensorType = unpackOp.getSourceType(); + + ArrayRef innerDimPos = unpackOp.getInnerDimsPos(); + ArrayRef innerTiles = unpackOp.getStaticInnerTiles(); + + SmallVector readMaskShape(inputVectorSizes.begin(), + inputVectorSizes.end()); + ArrayRef outerDimsPerm = unpackOp.getOuterDimsPerm(); + ArrayRef sourceShape = unpackTensorType.getShape(); + + // ReadMask is the size of tensor used to read and apply mask. It is + // set like this: Let's say the vectorSize (VS) array is size 'N' and + // the sourceShape(SS) is 'M' where M >= N and InnerTileSizes (IT) of + // size M-N + // Thus: + // - initially: ReadMaskShape = vectorInputSizes + // - Divide all the readMaskShape locations pointed by innerDimPos + // by the innerTileSize attribute value. + // - if outer_dims_perms is present: do that permutation on readMaskShape. + // - Append the remaining shape from SS + // E.g. 
let's say unpackTensorType.getShape() = <8x8x32x16> + // inner Dim Pos = [0, 1] and Inner Tiles = [32, 16], vector_sizes are [512, + // 128] and outer_dims_perm is [1, 0] then read shape is: + // ReadMaskShape(initial): [512, 128] + // Final Value(after innerDim Adjustment): [512/32, 128/16] + // = [16, 8] + // After applying outer_dims_perm: [8, 16] + // After appending the rest of the sourceShape: [8, 16, 32, 16] + + for (auto [index, size] : enumerate(innerTiles)) { + readMaskShape[innerDimPos[index]] = + llvm::divideCeil(readMaskShape[innerDimPos[index]], size); + } + if (!outerDimsPerm.empty()) { + applyPermutationToVector(readMaskShape, outerDimsPerm); + } + readMaskShape.append(sourceShape.begin() + inputVectorSizes.size(), + sourceShape.end()); + + ReifiedRankedShapedTypeDims reifiedRetShapes; + LogicalResult status = + cast(unpackOp.getOperation()) + .reifyResultShapes(rewriter, reifiedRetShapes); + if (status.failed()) { + LDBG("Unable to reify result shapes of " << unpackOp); + return failure(); + } + Location loc = unpackOp->getLoc(); + + auto padValue = rewriter.create( + loc, rewriter.getZeroAttr(unpackOp.getSourceType().getElementType())); + + // Read result, mask if necessary. If transferReadOp shape is not equal + // to shape of source, then a mask is necessary. 
+ Value readResult = createReadOrMaskedRead( + rewriter, loc, unpackOp.getSource(), + ArrayRef(readMaskShape.begin(), readMaskShape.end()), padValue); + + PackingMetadata packMetadata; + SmallVector lastDimToInsertPosPerm = + tensor::getUnPackInverseSrcPerm(unpackOp, packMetadata); + ShapedType maskedOpShapedType = cast(readResult.getType()); + SmallVector stripMineShape(maskedOpShapedType.getShape()); + mlir::Type stripMineElemType = maskedOpShapedType.getElementType(); + applyPermutationToVector(stripMineShape, lastDimToInsertPosPerm); + RankedTensorType stripMineTensorType = + RankedTensorType::get(stripMineShape, stripMineElemType); + // Transpose the appropriate rows to match output. + vector::TransposeOp transposeOp = rewriter.create( + loc, readResult, lastDimToInsertPosPerm); + + // Collapse the vector to the size required by result. + RankedTensorType collapsedType = tensor::CollapseShapeOp::inferCollapsedType( + stripMineTensorType, packMetadata.reassociations); + mlir::VectorType vecCollapsedType = + VectorType::get(collapsedType.getShape(), collapsedType.getElementType()); + vector::ShapeCastOp shapeCastOp = rewriter.create( + loc, vecCollapsedType, transposeOp->getResult(0)); + + // WriteMaskShape had to match the shapecast shape for dynamic sizes, + // otherwise the validator complains that the mask size is invalid. + SmallVector writeMaskShape( + unpackOp.getDestType().hasStaticShape() + ? inputVectorSizes + : shapeCastOp.getResultVectorType().getShape()); + Operation *write = + createWriteOrMaskedWrite(rewriter, loc, shapeCastOp.getResult(), + reifiedRetShapes[0], writeMaskShape); + newResults.push_back(write->getResult(0)); + return success(); +} + /// Vectorize a `padOp` with (1) static result type, (2) constant padding value /// and (3) all-zero lowPad to /// `transfer_write_in_bounds(transfer_read_masked(pad_source, pad_value))`. 
@@ -1655,6 +1760,25 @@ isValidMaskedInputVector(ArrayRef shape, return success(); } +/// Need to check if the inner-tiles are static/constant. +static LogicalResult +vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp, + ArrayRef inputVectorSizes) { + + if (llvm::any_of(unpackOp.getInnerTiles(), [](OpFoldResult res) { + return !getConstantIntValue(res).has_value(); + })) { + LDBG("Inner-tiles must be constant: " << unpackOp << "\n"); + return failure(); + } + llvm::ArrayRef resultShape = unpackOp.getDestType().getShape(); + if (!inputVectorSizes.empty() && + failed(isValidMaskedInputVector(resultShape, inputVectorSizes))) + return failure(); + + return success(); +} + static LogicalResult vectorizeLinalgOpPrecondition(LinalgOp linalgOp, ArrayRef inputVectorSizes, @@ -1703,9 +1827,10 @@ vectorizeLinalgOpPrecondition(LinalgOp linalgOp, } if (isElementwise(linalgOp)) return success(); - // TODO: isaConvolutionOpInterface that can also infer from generic features. - // But we will still need stride/dilation attributes that will be annoying to - // reverse-engineer... + + // TODO: isaConvolutionOpInterface that can also infer from generic + // features. But we will still need stride/dilation attributes that will be + // annoying to reverse-engineer... if (isa(linalgOp.getOperation())) return success(); // TODO: the common vector shape is equal to the static loop sizes only when @@ -1810,6 +1935,9 @@ LogicalResult mlir::linalg::vectorizeOpPrecondition( .Case([&](auto packOp) { return vectorizePackOpPrecondition(packOp, inputVectorSizes); }) + .Case([&](auto unpackOp) { + return vectorizeUnPackOpPrecondition(unpackOp, inputVectorSizes); + }) .Default([](auto) { return failure(); }); } @@ -1829,11 +1957,11 @@ static void convertAffineApply(RewriterBase &rewriter, LinalgOp linalgOp) { } /// Emit a suitable vector form for an operation. If provided, -/// `inputVectorSizes` are used to vectorize this operation. 
`inputVectorSizes` -/// must match the rank of the iteration space of the operation and the input -/// vector sizes must be greater than or equal to their counterpart iteration -/// space sizes, if static. `inputVectorShapes` also allows the vectorization of -/// operations with dynamic shapes. +/// `inputVectorSizes` are used to vectorize this operation. +/// `inputVectorSizes` must match the rank of the iteration space of the +/// operation and the input vector sizes must be greater than or equal to +/// their counterpart iteration space sizes, if static. `inputVectorShapes` +/// also allows the vectorization of operations with dynamic shapes. LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, ArrayRef inputVectorSizes, ArrayRef inputScalableVecDims, @@ -1867,8 +1995,9 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, auto vectorizeResult = TypeSwitch(op) .Case([&](auto linalgOp) { - // TODO: isaConvolutionOpInterface that can also infer from generic - // features. Will require stride/dilation attributes inference. + // TODO: isaConvolutionOpInterface that can also infer from + // generic features. Will require stride/dilation attributes + // inference. 
if (isa(linalgOp.getOperation())) { FailureOr convOr = vectorizeConvolution( rewriter, linalgOp, flatten1DDepthwiseConv); @@ -1902,6 +2031,10 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, return vectorizeAsTensorPackOp(rewriter, packOp, inputVectorSizes, results); }) + .Case([&](auto unpackOp) { + return vectorizeAsTensorUnpackOp(rewriter, unpackOp, + inputVectorSizes, results); + }) .Default([](auto) { return failure(); }); if (failed(vectorizeResult)) { @@ -1919,7 +2052,6 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, LogicalResult mlir::linalg::vectorizeCopy(RewriterBase &rewriter, memref::CopyOp copyOp) { - auto srcType = cast(copyOp.getSource().getType()); auto dstType = cast(copyOp.getTarget().getType()); if (!srcType.hasStaticShape() || !dstType.hasStaticShape()) @@ -2833,8 +2965,8 @@ struct Conv1DGenerator Value res = rewriter.create(loc, resType, resShaped, resPadding); - // The base vectorization case for channeled convolution is input: {n,w,c}, - // weight: {kw,c,f}, output: {n,w,f}. To reuse the base pattern + // The base vectorization case for channeled convolution is input: + // {n,w,c}, weight: {kw,c,f}, output: {n,w,f}. To reuse the base pattern // vectorization case, we do pre transpose on input, weight, and output. 
switch (conv1DOpOrder) { case Conv1DOpOrder::W: @@ -2877,9 +3009,9 @@ struct Conv1DGenerator return kw * (wSize / wSizeStep) + w; }; - // Compute contraction: O{n, w, f} += I{n, sw * w + dw * kw, c} * F{c, f} or - // perform outerproduct for non-channeled convolution or - // perform simple arith operation for pooling + // Compute contraction: O{n, w, f} += I{n, sw * w + dw * kw, c} * F{c, f} + // or perform outerproduct for non-channeled convolution or perform simple + // arith operation for pooling for (int64_t kw = 0; kw < kwSize; ++kw) { for (int64_t w = 0; w < wSize; w += wSizeStep) { switch (oper) { @@ -2908,9 +3040,9 @@ struct Conv1DGenerator // End vector-only rewrite part //===------------------------------------------------------------------===// - // The base vectorization case for channeled convolution is output: {n,w,f} - // To reuse the result from base pattern vectorization case, we post - // transpose the base case result. + // The base vectorization case for channeled convolution is output: + // {n,w,f} To reuse the result from base pattern vectorization case, we + // post transpose the base case result. switch (conv1DOpOrder) { case Conv1DOpOrder::W: case Conv1DOpOrder::Nwc: @@ -3348,9 +3480,9 @@ static FailureOr vectorizeConvolution(RewriterBase &rewriter, LinalgOp op, bool flatten1DDepthwiseConv) { // The ConvolutionOpInterface gives us guarantees of existence for - // strides/dilations. However, we do not need to rely on those, we can simply - // use them if present, otherwise use the default and let the generic conv. - // matcher in the ConvGenerator succeed or fail. + // strides/dilations. However, we do not need to rely on those, we can + // simply use them if present, otherwise use the default and let the generic + // conv. matcher in the ConvGenerator succeed or fail. auto strides = op->getAttrOfType("strides"); auto dilations = op->getAttrOfType("dilations"); auto stride = strides ? 
*strides.getValues().begin() : 1; diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index f20008a1ed2b2..186f85d2ce20a 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -72,36 +72,73 @@ mlir::tensor::computeTransposedType(RankedTensorType rankedTensorType, RTTBuilder(rankedTensorType).setShape(transposedShape); return transposedTensorType; } - -SmallVector -mlir::tensor::getPackInverseDestPermutation(PackOp packOp) { - // The permutation can be obtained from two permutations: - // a) Compute the permutation vector to move the last `numPackedDims` into - // the `innerPosDims` of a shape of rank `packedRank`. - // b) Compute the permutation vector to move outer dims if the pack op - // has outer_dims_perm. - // Apply (b) permutation on (a) permutation to get the final permutation. - int64_t numPackedDims = packOp.getInnerDimsPos().size(); - int64_t packedRank = packOp.getDestType().getRank(); - auto lastDims = llvm::to_vector( - llvm::seq(packedRank - numPackedDims, packedRank)); - PackingMetadata packingMetadata = computePackingMetadata( - packOp.getDestType().getRank(), packOp.getInnerDimsPos()); - SmallVector innerPositionsPerm = computePermutationVector( - packedRank, lastDims, packingMetadata.insertPositions); +/// The permutation can be obtained from two permutations: +/// a) Compute the permutation vector to move the last `numPackedDims` into +/// the `innerPosDims` of a shape of rank `rank`. +/// b) Compute the permutation vector to move outer dims if the +/// `outerPerm` parameter is not empty. +/// Apply (b) permutation on (a) permutation to get the final permutation. 
+static SmallVector +computePackUnPackPerm(int64_t rank, ArrayRef &innerDimsPos, + ArrayRef &outerPerm, + PackingMetadata &packingMetadata) { + int64_t numPackedDims = innerDimsPos.size(); + auto lastDims = + llvm::to_vector(llvm::seq(rank - numPackedDims, rank)); + packingMetadata = computePackingMetadata(rank, innerDimsPos); + SmallVector innerPositionsPerm = + computePermutationVector(rank, lastDims, packingMetadata.insertPositions); SmallVector outerPos = packingMetadata.outerPositions; - ArrayRef outerPerm = packOp.getOuterDimsPerm(); if (!outerPerm.empty()) applyPermutationToVector(outerPos, outerPerm); - SmallVector outerPositionPerm = computePermutationVector( - packedRank, packingMetadata.outerPositions, outerPos); + SmallVector outerPositionPerm = + computePermutationVector(rank, packingMetadata.outerPositions, outerPos); SmallVector packInverseDestPermutation = innerPositionsPerm; applyPermutationToVector(packInverseDestPermutation, outerPositionPerm); return packInverseDestPermutation; } +/// Shell function to compute the Destination Permutation of PackOp +/// This function uses the helper function `computePackUnPackPerm` to get +/// the permutation vector. Only major difference between UnPack and Pack is +/// that packOp uses destination rank whereas unpack Uses source rank. +SmallVector mlir::tensor::getPackInverseDestPerm(PackOp packOp) { + + PackingMetadata pMetadata; + int64_t packedRank = packOp.getDestType().getRank(); + ArrayRef innerDimPos = packOp.getInnerDimsPos(); + ArrayRef outerPerm = packOp.getOuterDimsPerm(); + SmallVector packInvDestPerm = + computePackUnPackPerm(packedRank, innerDimPos, outerPerm, pMetadata); + return packInvDestPerm; +} + +/// Shell function to compute the Source Permutation of unPackOp. +/// This function, like the getPackInverseDestPerm uses the helper function +/// computePackUnPackPerm` to get the permutation vector. 
+/// Only major difference between UnPack and Pack is that packOp uses +/// destination rank whereas unpack Uses source rank. +SmallVector mlir::tensor::getUnPackInverseSrcPerm(UnPackOp unpackOp) { + PackingMetadata metadata; + return mlir::tensor::getUnPackInverseSrcPerm(unpackOp, metadata); +} + +/// Shell function to compute the Source rank permutation for unpackOp +/// Unpack requires some packing metadata data information, so created +/// another function where this value is passed by reference. +SmallVector +mlir::tensor::getUnPackInverseSrcPerm(UnPackOp unpackOp, + PackingMetadata &metadata) { + int64_t unpackRank = unpackOp.getSourceType().getRank(); + ArrayRef innerDimPos = unpackOp.getInnerDimsPos(); + ArrayRef outerPerm = unpackOp.getOuterDimsPerm(); + SmallVector unpackInvSrcPerm = + computePackUnPackPerm(unpackRank, innerDimPos, outerPerm, metadata); + return unpackInvSrcPerm; +} + bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) { llvm::SmallBitVector droppedDims = op.getDroppedDims(); int64_t srcDim = 0; diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index 0272ac599aa3d..2d01d57304013 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -697,3 +697,118 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack +func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor, %arg1: tensor) -> tensor { +// CHECK: %[[C0:.*]] = arith.constant 0 +// CHECK: %[[DIM:.*]] = tensor.dim %arg0, %[[C0]] : tensor +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[DIM0:.*]] = tensor.dim %arg0, %[[C1]] : tensor +// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 +// CHECK: %[[C01:.*]] = arith.constant 0 +// CHECK: %[[C02:.*]] = arith.constant 0 +// CHECK: %[[DIM4:.*]] = tensor.dim %arg1, %[[C02]] : tensor +// CHECK: %[[CNST14:.*]] = 
arith.constant 1 +// CHECK: %[[DIM6:.*]] = tensor.dim %arg1, %[[CNST14]] : tensor +// CHECK: %[[CNST16:.*]] = arith.constant 16 : index +// CHECK: %[[CNST2:.*]] = arith.constant 2 : index +// CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1> +// CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32> +// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32> +// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32> +// CHECK: %[[empt0:.*]] = tensor.empty +// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1> +// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]] +// CHECK: return %[[write0]] + %ret = tensor.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor -> tensor + return %ret : tensor +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_unpack +func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { + // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 + // CHECK: %[[C0:.*]]= arith.constant 0 : index + // CHECK: %[[C8:.*]] = arith.constant 8 : index + // CHECK: %[[C80:.*]] = arith.constant 8 : index + // CHECK: %[[C32:.*]] = arith.constant 32 : index + // CHECK: %[[C16:.*]] = arith.constant 16 : index + // CHECK: %[[MSK0:.*]] = vector.create_mask %[[C8]], %[[C80]], %[[C32]], 
%[[C16]] : vector<16x8x32x16xi1> + // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] {{.*}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32> + // CHECK: %[[TRANSP0:.*]] = vector.transpose %[[READ0]], [0, 2, 1, 3] : vector<16x8x32x16xf32> to vector<16x32x8x16xf32> + // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP0]] : vector<16x32x8x16xf32> to vector<512x128xf32> + // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> + // CHECK: %[[C01:.*]] = arith.constant 0 : index + // CHECK: %[[C256:.*]] = arith.constant 256 : index + // CHECK: %[[C128:.*]] = arith.constant 128 : index + // CHECK: %[[WRITEMSK:.*]] = vector.create_mask %[[C256]], %[[C128]] : vector<512x128xi1> + // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] {{.*}} : vector<512x128xi1> -> tensor<256x128xf32> + // CHECK: return %[[WRIT]] : tensor<256x128xf32> + %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> + } + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func @test_vectorize_unpack_no_masks +func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { + // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> + // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to 
vector<256x128xf32> + // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> + // CHECK: %[[C00:.*]] = arith.constant 0 : index + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> + // CHECK: return %[[WRIT]] : tensor<256x128xf32> + %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> + } + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op + transform.yield + } + } + + // ----- + + // CHECK-LABEL: test_vectorize_unpack_with_outer_perm + func.func @test_vectorize_unpack_with_outer_perm(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { + // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> + // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> + // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> + // CHECK: %[[C00:.*]] = arith.constant 0 : index + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> + // CHECK: return %[[WRIT]] : tensor<256x128xf32> + %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> + } + module attributes 
{transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op + transform.yield + } +} From 18f0da5b9bbe8ebf63eb17bfa5deff94bd602f64 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 13:32:20 -0700 Subject: [PATCH 015/351] Apply clang-tidy fixes for llvm-prefer-isa-or-dyn-cast-in-conditionals in OpFormatGen.cpp (NFC) --- mlir/tools/mlir-tblgen/OpFormatGen.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 31ceb05ad1dbf..eb8c0aba1d33b 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1414,7 +1414,7 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, } body.unindent() << "}\n"; body.unindent(); - } else if (dyn_cast(element)) { + } else if (isa(element)) { body << " if (parseProperties(parser, result))\n" << " return ::mlir::failure();\n"; } else if (auto *customDir = dyn_cast(element)) { @@ -2239,7 +2239,7 @@ void OperationFormat::genElementPrinter(FormatElement *element, } // Emit the attribute dictionary. 
- if (dyn_cast(element)) { + if (isa(element)) { genPropDictPrinter(*this, op, body); lastWasPunctuation = false; return; From e13bbd1e71797c781a0b242a7c121e5c5c620bc2 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 13:37:14 -0700 Subject: [PATCH 016/351] Apply clang-tidy fixes for modernize-use-emplace in RewriterGen.cpp (NFC) --- mlir/tools/mlir-tblgen/RewriterGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp index 77c34cb03e987..426a3482960be 100644 --- a/mlir/tools/mlir-tblgen/RewriterGen.cpp +++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp @@ -1785,7 +1785,7 @@ void PatternEmitter::createAggregateLocalVarsForOpArgs( range); sizes.push_back(formatv("static_cast({0}.size())", range)); } else { - sizes.push_back("1"); + sizes.emplace_back("1"); os << formatv("tblgen_values.push_back("); if (node.isNestedDagArg(argIndex)) { os << symbolInfoMap.getValueAndRangeUse( From dd5696cdefaff2b3ee9c4de3053e31630863588c Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 13:49:20 -0700 Subject: [PATCH 017/351] Apply clang-tidy fixes for readability-identifier-naming in BytecodeTest.cpp (NFC) --- mlir/unittests/Bytecode/BytecodeTest.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp index bb7241c2d5196..a37a2afc22645 100644 --- a/mlir/unittests/Bytecode/BytecodeTest.cpp +++ b/mlir/unittests/Bytecode/BytecodeTest.cpp @@ -23,7 +23,7 @@ using namespace llvm; using namespace mlir; -StringLiteral IRWithResources = R"( +StringLiteral irWithResources = R"( module @TestDialectResources attributes { bytecode.test = dense_resource : tensor<4xi32> } {} @@ -42,7 +42,7 @@ TEST(Bytecode, MultiModuleWithResource) { Builder builder(&context); ParserConfig parseConfig(&context); OwningOpRef module = - parseSourceString(IRWithResources, parseConfig); 
+ parseSourceString(irWithResources, parseConfig); ASSERT_TRUE(module); // Write the module to bytecode @@ -53,15 +53,15 @@ TEST(Bytecode, MultiModuleWithResource) { // Create copy of buffer which is aligned to requested resource alignment. constexpr size_t kAlignment = 0x20; - size_t buffer_size = buffer.size(); - buffer.reserve(buffer_size + kAlignment - 1); + size_t bufferSize = buffer.size(); + buffer.reserve(bufferSize + kAlignment - 1); size_t pad = ~(uintptr_t)buffer.data() + 1 & kAlignment - 1; buffer.insert(0, pad, ' '); - StringRef aligned_buffer(buffer.data() + pad, buffer_size); + StringRef alignedBuffer(buffer.data() + pad, bufferSize); // Parse it back OwningOpRef roundTripModule = - parseSourceString(aligned_buffer, parseConfig); + parseSourceString(alignedBuffer, parseConfig); ASSERT_TRUE(roundTripModule); // FIXME: Parsing external resources does not work on big-endian From 1893a3743eb971f0ea7657dc119b642a12870a1e Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 13:52:17 -0700 Subject: [PATCH 018/351] Apply clang-tidy fixes for performance-unnecessary-value-param in FileLineColLocBreakpointManagerTest.cpp (NFC) --- mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp b/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp index 48c62ad20a04a..5b48e80749c8b 100644 --- a/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp +++ b/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp @@ -98,7 +98,7 @@ TEST(FileLineColLocBreakpointManager, OperationMatch) { // Set a breakpoint matching only the second operation in the list. 
auto *breakpoint = breakpointManager.addBreakpoint( fileNames[0], lineColLoc[0].first, lineColLoc[0].second); - auto checkMatchIdxs = [&](DenseSet idxs) { + auto checkMatchIdxs = [&](const DenseSet &idxs) { counter = 0; int reference = 0; for (int i = 0; i < (int)operations.size(); ++i) { From 31f45596737f37e16226c039ff6f53406174b9d5 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 13:58:56 -0700 Subject: [PATCH 019/351] Apply clang-tidy fixes for performance-unnecessary-value-param in SerializationTest.cpp (NFC) --- mlir/unittests/Dialect/SPIRV/SerializationTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp b/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp index 56a98cc205ab4..3a6bcbd999a57 100644 --- a/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp +++ b/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp @@ -77,7 +77,7 @@ class SerializationTest : public ::testing::Test { } // Inserts an Integer or a Vector of Integers constant of value 'val'. - spirv::ConstantOp AddConstInt(Type type, APInt val) { + spirv::ConstantOp AddConstInt(Type type, const APInt &val) { OpBuilder builder(module->getRegion()); auto loc = UnknownLoc::get(&context); From 563ef306017a47d387f1c36dd562b172c1ad0626 Mon Sep 17 00:00:00 2001 From: jimingham Date: Tue, 20 Feb 2024 14:18:03 -0800 Subject: [PATCH 020/351] Add the RegisterCompleter to eArgTypeRegisterName in g_argument_table (#82428) This is a follow-on to: https://github.com/llvm/llvm-project/pull/82085 The completer for register names was missing from the argument table. I somehow missed that the only register completer test was x86_64, so that test broke. I added the completer in to the right slot in the argument table, and added a small completions test that just uses the alias register names. 
If we end up having a platform that doesn't define register names, we'll have to skip this test there, but it should add a sniff test for register completion that will run most everywhere. --- lldb/include/lldb/Interpreter/CommandObject.h | 9 ++- .../Interpreter/CommandOptionArgumentTable.h | 23 +++--- lldb/include/lldb/lldb-enumerations.h | 3 + .../source/Commands/CommandObjectCommands.cpp | 14 ---- .../Commands/CommandObjectDWIMPrint.cpp | 6 -- lldb/source/Commands/CommandObjectDWIMPrint.h | 4 - lldb/source/Commands/CommandObjectFrame.cpp | 19 ----- .../source/Commands/CommandObjectPlatform.cpp | 75 +++++-------------- lldb/source/Commands/CommandObjectPlugin.cpp | 7 -- lldb/source/Commands/CommandObjectProcess.cpp | 19 +---- .../source/Commands/CommandObjectRegister.cpp | 7 +- lldb/source/Commands/CommandObjectSession.cpp | 7 -- .../source/Commands/CommandObjectSettings.cpp | 8 -- lldb/source/Commands/CommandObjectTarget.cpp | 29 +------ lldb/source/Commands/CommandObjectThread.cpp | 13 +--- lldb/source/Commands/CommandObjectType.cpp | 32 -------- .../Commands/CommandObjectWatchpoint.cpp | 10 --- lldb/source/Interpreter/CommandObject.cpp | 37 +++++++++ lldb/test/API/commands/help/TestHelp.py | 2 +- .../completion/TestCompletion.py | 21 ++++++ 20 files changed, 108 insertions(+), 237 deletions(-) diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h index b99de56f53446..a326c6dc38a37 100644 --- a/lldb/include/lldb/Interpreter/CommandObject.h +++ b/lldb/include/lldb/Interpreter/CommandObject.h @@ -242,6 +242,13 @@ class CommandObject : public std::enable_shared_from_this { /// The completion request that needs to be answered. virtual void HandleCompletion(CompletionRequest &request); + /// The default version handles argument definitions that have only one + /// argument type, and use one of the argument types that have an entry in + /// the CommonCompletions. 
Override this if you have a more complex + /// argument setup. + /// FIXME: we should be able to extend this to more complex argument + /// definitions provided we have completers for all the argument types. + /// /// The input array contains a parsed version of the line. /// /// We've constructed the map of options and their arguments as well if that @@ -251,7 +258,7 @@ class CommandObject : public std::enable_shared_from_this { /// The completion request that needs to be answered. virtual void HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) {} + OptionElementVector &opt_element_vector); bool HelpTextContainsWord(llvm::StringRef search_word, bool search_short_help = true, diff --git a/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h b/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h index d0cf54c31ca73..9248e2ac81446 100644 --- a/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h +++ b/lldb/include/lldb/Interpreter/CommandOptionArgumentTable.h @@ -243,7 +243,7 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = { { lldb::eArgTypeLogCategory, "log-category", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a category within a log channel, e.g. all (try \"log list\" to see a list of all channels and their categories." }, { lldb::eArgTypeLogChannel, "log-channel", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a log channel, e.g. process.gdb-remote (try \"log list\" to see a list of all channels and their categories)." }, { lldb::eArgTypeMethod, "method", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A C++ method name." }, - { lldb::eArgTypeName, "name", lldb::eTypeCategoryNameCompletion, {}, { nullptr, false }, "Help text goes here." }, + { lldb::eArgTypeName, "name", lldb::eTypeCategoryNameCompletion, {}, { nullptr, false }, "The name of a type category." 
}, { lldb::eArgTypeNewPathPrefix, "new-path-prefix", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Help text goes here." }, { lldb::eArgTypeNumLines, "num-lines", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The number of lines to use." }, { lldb::eArgTypeNumberPerLine, "number-per-line", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The number of items per line to display." }, @@ -260,9 +260,9 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = { { lldb::eArgTypePythonFunction, "python-function", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a Python function." }, { lldb::eArgTypePythonScript, "python-script", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Source code written in Python." }, { lldb::eArgTypeQueueName, "queue-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of the thread queue." }, - { lldb::eArgTypeRegisterName, "register-name", lldb::CompletionType::eNoCompletion, {}, { RegisterNameHelpTextCallback, true }, nullptr }, + { lldb::eArgTypeRegisterName, "register-name", lldb::CompletionType::eRegisterCompletion, {}, { RegisterNameHelpTextCallback, true }, nullptr }, { lldb::eArgTypeRegularExpression, "regular-expression", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A POSIX-compliant extended regular expression." }, - { lldb::eArgTypeRunArgs, "run-args", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Arguments to be passed to the target program when it starts executing." }, + { lldb::eArgTypeRunArgs, "run-args", lldb::CompletionType::eDiskFileCompletion, {}, { nullptr, false }, "Arguments to be passed to the target program when it starts executing." }, { lldb::eArgTypeRunMode, "run-mode", lldb::CompletionType::eNoCompletion, g_running_mode, { nullptr, false }, "Help text goes here." 
}, { lldb::eArgTypeScriptedCommandSynchronicity, "script-cmd-synchronicity", lldb::CompletionType::eNoCompletion, g_script_synchro_type, { nullptr, false }, "The synchronicity to use to run scripted commands with regard to LLDB event system." }, { lldb::eArgTypeScriptLang, "script-language", lldb::CompletionType::eNoCompletion, g_script_option_enumeration, { nullptr, false }, "The scripting language to be used for script-based commands. Supported languages are python and lua." }, @@ -270,21 +270,21 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = { { lldb::eArgTypeSelector, "selector", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "An Objective-C selector name." }, { lldb::eArgTypeSettingIndex, "setting-index", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "An index into a settings variable that is an array (try 'settings list' to see all the possible settings variables and their types)." }, { lldb::eArgTypeSettingKey, "setting-key", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A key into a settings variables that is a dictionary (try 'settings list' to see all the possible settings variables and their types)." }, - { lldb::eArgTypeSettingPrefix, "setting-prefix", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a settable internal debugger variable up to a dot ('.'), e.g. 'target.process.'" }, - { lldb::eArgTypeSettingVariableName, "setting-variable-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a settable internal debugger variable. Type 'settings list' to see a complete list of such variables." }, - { lldb::eArgTypeShlibName, "shlib-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The name of a shared library." }, + { lldb::eArgTypeSettingPrefix, "setting-prefix", lldb::CompletionType::eSettingsNameCompletion, {}, { nullptr, false }, "The name of a settable internal debugger variable up to a dot ('.'), e.g. 
'target.process.'" }, + { lldb::eArgTypeSettingVariableName, "setting-variable-name", lldb::CompletionType::eSettingsNameCompletion, {}, { nullptr, false }, "The name of a settable internal debugger variable. Type 'settings list' to see a complete list of such variables." }, + { lldb::eArgTypeShlibName, "shlib-name", lldb::CompletionType::eDiskFileCompletion, {}, { nullptr, false }, "The name of a shared library." }, { lldb::eArgTypeSourceFile, "source-file", lldb::eSourceFileCompletion, {}, { nullptr, false }, "The name of a source file.." }, { lldb::eArgTypeSortOrder, "sort-order", lldb::CompletionType::eNoCompletion, g_sort_option_enumeration, { nullptr, false }, "Specify a sort order when dumping lists." }, { lldb::eArgTypeStartAddress, "start-address", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Help text goes here." }, { lldb::eArgTypeSummaryString, "summary-string", lldb::CompletionType::eNoCompletion, {}, { SummaryStringHelpTextCallback, true }, nullptr }, { lldb::eArgTypeSymbol, "symbol", lldb::eSymbolCompletion, {}, { nullptr, false }, "Any symbol name (function name, variable, argument, etc.)" }, - { lldb::eArgTypeThreadID, "thread-id", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Thread ID number." }, - { lldb::eArgTypeThreadIndex, "thread-index", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Index into the process' list of threads." }, + { lldb::eArgTypeThreadID, "thread-id", lldb::CompletionType::eThreadIndexCompletion, {}, { nullptr, false }, "Thread ID number." }, + { lldb::eArgTypeThreadIndex, "thread-index", lldb::CompletionType::eThreadIndexCompletion, {}, { nullptr, false }, "Index into the process' list of threads." }, { lldb::eArgTypeThreadName, "thread-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The thread's name." }, { lldb::eArgTypeTypeName, "type-name", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A type name." 
}, { lldb::eArgTypeUnsignedInteger, "unsigned-integer", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "An unsigned integer." }, { lldb::eArgTypeUnixSignal, "unix-signal", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A valid Unix signal name or number (e.g. SIGKILL, KILL or 9)." }, - { lldb::eArgTypeVarName, "variable-name", lldb::CompletionType::eNoCompletion, {} ,{ nullptr, false }, "The name of a variable in your program." }, + { lldb::eArgTypeVarName, "variable-name", lldb::CompletionType::eVariablePathCompletion, {} ,{ nullptr, false }, "The name of a variable in your program." }, { lldb::eArgTypeValue, "value", lldb::CompletionType::eNoCompletion, g_dependents_enumeration, { nullptr, false }, "A value could be anything, depending on where and how it is used." }, { lldb::eArgTypeWidth, "width", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "Help text goes here." }, { lldb::eArgTypeNone, "none", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "No help available for this." }, @@ -302,8 +302,11 @@ static constexpr CommandObject::ArgumentTableEntry g_argument_table[] = { { lldb::eArgTypeRecognizerID, "frame-recognizer-id", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The ID for a stack frame recognizer." }, { lldb::eArgTypeConnectURL, "process-connect-url", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "A URL-style specification for a remote connection." }, { lldb::eArgTypeTargetID, "target-id", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The index ID for an lldb Target." }, - { lldb::eArgTypeStopHookID, "stop-hook-id", lldb::CompletionType::eNoCompletion, {}, { nullptr, false }, "The ID you receive when you create a stop-hook." }, + { lldb::eArgTypeStopHookID, "stop-hook-id", lldb::CompletionType::eStopHookIDCompletion, {}, { nullptr, false }, "The ID you receive when you create a stop-hook." 
}, { lldb::eArgTypeCompletionType, "completion-type", lldb::CompletionType::eNoCompletion, g_completion_type, { nullptr, false }, "The completion type to use when adding custom commands. If none is specified, the command won't use auto-completion." }, + { lldb::eArgTypeRemotePath, "remote-path", lldb::CompletionType::eRemoteDiskFileCompletion, {}, { nullptr, false }, "A path on the system managed by the current platform." }, + { lldb::eArgTypeRemoteFilename, "remote-filename", lldb::CompletionType::eRemoteDiskFileCompletion, {}, { nullptr, false }, "A file on the system managed by the current platform." }, + { lldb::eArgTypeModule, "module", lldb::CompletionType::eModuleCompletion, {}, { nullptr, false }, "The name of a module loaded into the current target." }, // clang-format on }; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 4640533047833..85769071dae78 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -651,6 +651,9 @@ enum CommandArgumentType { eArgTypeTargetID, eArgTypeStopHookID, eArgTypeCompletionType, + eArgTypeRemotePath, + eArgTypeRemoteFilename, + eArgTypeModule, eArgTypeLastArg // Always keep this entry as the last entry in this // enumeration!! 
}; diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index b7cd65059b221..7c459bdaf3802 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -63,13 +63,6 @@ class CommandObjectCommandsSource : public CommandObjectParsed { return std::string(""); } - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - Options *GetOptions() override { return &m_options; } protected: @@ -1968,13 +1961,6 @@ class CommandObjectCommandsScriptImport : public CommandObjectParsed { ~CommandObjectCommandsScriptImport() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - Options *GetOptions() override { return &m_options; } protected: diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp index 695f3d7931cd0..fb2cc106ffd2d 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp +++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp @@ -52,12 +52,6 @@ CommandObjectDWIMPrint::CommandObjectDWIMPrint(CommandInterpreter &interpreter) Options *CommandObjectDWIMPrint::GetOptions() { return &m_option_group; } -void CommandObjectDWIMPrint::HandleArgumentCompletion( - CompletionRequest &request, OptionElementVector &opt_element_vector) { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eVariablePathCompletion, request, nullptr); -} - void CommandObjectDWIMPrint::DoExecute(StringRef command, CommandReturnObject &result) { 
m_option_group.NotifyOptionParsingStarting(&m_exe_ctx); diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.h b/lldb/source/Commands/CommandObjectDWIMPrint.h index d868f8964c2ac..01ba9c225e330 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.h +++ b/lldb/source/Commands/CommandObjectDWIMPrint.h @@ -39,10 +39,6 @@ class CommandObjectDWIMPrint : public CommandObjectRaw { bool WantsCompletion() override { return true; } - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override; - private: void DoExecute(llvm::StringRef command, CommandReturnObject &result) override; diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index a4d3fb66e8b55..f092d54ffe993 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -286,16 +286,6 @@ class CommandObjectFrameSelect : public CommandObjectParsed { ~CommandObjectFrameSelect() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - if (request.GetCursorIndex() != 0) - return; - - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eFrameIndexCompletion, request, nullptr); - } - Options *GetOptions() override { return &m_options; } protected: @@ -446,15 +436,6 @@ may even involve JITing and running code in the target program.)"); Options *GetOptions() override { return &m_option_group; } - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - // Arguments are the standard source file completer. 
- lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eVariablePathCompletion, request, - nullptr); - } - protected: llvm::StringRef GetScopeString(VariableSP var_sp) { if (!var_sp) diff --git a/lldb/source/Commands/CommandObjectPlatform.cpp b/lldb/source/Commands/CommandObjectPlatform.cpp index 790f1dbb47535..b25c391bd4faa 100644 --- a/lldb/source/Commands/CommandObjectPlatform.cpp +++ b/lldb/source/Commands/CommandObjectPlatform.cpp @@ -418,7 +418,7 @@ class CommandObjectPlatformMkDir : public CommandObjectParsed { : CommandObjectParsed(interpreter, "platform mkdir", "Make a new directory on the remote end.", nullptr, 0) { - CommandArgumentData thread_arg{eArgTypePath, eArgRepeatPlain}; + CommandArgumentData thread_arg{eArgTypeRemotePath, eArgRepeatPlain}; m_arguments.push_back({thread_arg}); } @@ -467,21 +467,12 @@ class CommandObjectPlatformFOpen : public CommandObjectParsed { CommandObjectPlatformFOpen(CommandInterpreter &interpreter) : CommandObjectParsed(interpreter, "platform file open", "Open a file on the remote end.", nullptr, 0) { - CommandArgumentData path_arg{eArgTypePath, eArgRepeatPlain}; + CommandArgumentData path_arg{eArgTypeRemotePath, eArgRepeatPlain}; m_arguments.push_back({path_arg}); } ~CommandObjectPlatformFOpen() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - if (request.GetCursorIndex() == 0) - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request, - nullptr); - } - void DoExecute(Args &args, CommandReturnObject &result) override { PlatformSP platform_sp( GetDebugger().GetPlatformList().GetSelectedPlatform()); @@ -795,7 +786,7 @@ class CommandObjectPlatformGetFile : public CommandObjectParsed { CommandArgumentData file_arg_remote, file_arg_host; // Define the first (and only) variant of this arg. 
- file_arg_remote.arg_type = eArgTypeFilename; + file_arg_remote.arg_type = eArgTypeRemoteFilename; file_arg_remote.arg_repetition = eArgRepeatPlain; // There is only one variant this argument could be; put it into the // argument entry. @@ -876,7 +867,7 @@ class CommandObjectPlatformGetSize : public CommandObjectParsed { CommandArgumentData file_arg_remote; // Define the first (and only) variant of this arg. - file_arg_remote.arg_type = eArgTypeFilename; + file_arg_remote.arg_type = eArgTypeRemoteFilename; file_arg_remote.arg_repetition = eArgRepeatPlain; // There is only one variant this argument could be; put it into the // argument entry. @@ -888,17 +879,6 @@ class CommandObjectPlatformGetSize : public CommandObjectParsed { ~CommandObjectPlatformGetSize() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - if (request.GetCursorIndex() != 0) - return; - - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request, - nullptr); - } - void DoExecute(Args &args, CommandReturnObject &result) override { // If the number of arguments is incorrect, issue an error message. if (args.GetArgumentCount() != 1) { @@ -946,7 +926,7 @@ class CommandObjectPlatformGetPermissions : public CommandObjectParsed { CommandArgumentData file_arg_remote; // Define the first (and only) variant of this arg. - file_arg_remote.arg_type = eArgTypeFilename; + file_arg_remote.arg_type = eArgTypeRemoteFilename; file_arg_remote.arg_repetition = eArgRepeatPlain; // There is only one variant this argument could be; put it into the // argument entry. 
@@ -958,17 +938,6 @@ class CommandObjectPlatformGetPermissions : public CommandObjectParsed { ~CommandObjectPlatformGetPermissions() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - if (request.GetCursorIndex() != 0) - return; - - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request, - nullptr); - } - void DoExecute(Args &args, CommandReturnObject &result) override { // If the number of arguments is incorrect, issue an error message. if (args.GetArgumentCount() != 1) { @@ -1015,7 +984,7 @@ class CommandObjectPlatformFileExists : public CommandObjectParsed { CommandArgumentData file_arg_remote; // Define the first (and only) variant of this arg. - file_arg_remote.arg_type = eArgTypeFilename; + file_arg_remote.arg_type = eArgTypeRemoteFilename; file_arg_remote.arg_repetition = eArgRepeatPlain; // There is only one variant this argument could be; put it into the // argument entry. @@ -1027,17 +996,6 @@ class CommandObjectPlatformFileExists : public CommandObjectParsed { ~CommandObjectPlatformFileExists() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - if (request.GetCursorIndex() != 0) - return; - - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request, - nullptr); - } - void DoExecute(Args &args, CommandReturnObject &result) override { // If the number of arguments is incorrect, issue an error message. 
if (args.GetArgumentCount() != 1) { @@ -1080,7 +1038,7 @@ class CommandObjectPlatformPutFile : public CommandObjectParsed { Omitting the destination places the file in the platform working directory.)"); CommandArgumentData source_arg{eArgTypePath, eArgRepeatPlain}; - CommandArgumentData path_arg{eArgTypePath, eArgRepeatOptional}; + CommandArgumentData path_arg{eArgTypeRemotePath, eArgRepeatOptional}; m_arguments.push_back({source_arg}); m_arguments.push_back({path_arg}); } @@ -1139,6 +1097,16 @@ class CommandObjectPlatformProcessLaunch : public CommandObjectParsed { m_arguments.push_back({run_arg_arg}); } + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + // I didn't make a type for RemoteRunArgs, but since we're going to run + // this on the remote system we should use the remote completer. + lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), lldb::eRemoteDiskFileCompletion, request, + nullptr); + } + ~CommandObjectPlatformProcessLaunch() override = default; Options *GetOptions() override { return &m_all_options; } @@ -1552,13 +1520,6 @@ class CommandObjectPlatformProcessInfo : public CommandObjectParsed { ~CommandObjectPlatformProcessInfo() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eProcessIDCompletion, request, nullptr); - } - protected: void DoExecute(Args &args, CommandReturnObject &result) override { Target *target = GetDebugger().GetSelectedTarget().get(); @@ -1850,7 +1811,7 @@ class CommandObjectPlatformInstall : public CommandObjectParsed { "Install a target (bundle or executable file) to the remote end.", "platform target-install ", 0) { CommandArgumentData local_arg{eArgTypePath, eArgRepeatPlain}; - CommandArgumentData remote_arg{eArgTypePath, 
eArgRepeatPlain}; + CommandArgumentData remote_arg{eArgTypeRemotePath, eArgRepeatPlain}; m_arguments.push_back({local_arg}); m_arguments.push_back({remote_arg}); } diff --git a/lldb/source/Commands/CommandObjectPlugin.cpp b/lldb/source/Commands/CommandObjectPlugin.cpp index f22885144b09b..da3b5f0518a69 100644 --- a/lldb/source/Commands/CommandObjectPlugin.cpp +++ b/lldb/source/Commands/CommandObjectPlugin.cpp @@ -36,13 +36,6 @@ class CommandObjectPluginLoad : public CommandObjectParsed { ~CommandObjectPluginLoad() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - protected: void DoExecute(Args &command, CommandReturnObject &result) override { size_t argc = command.GetArgumentCount(); diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 722b0e0c376be..7cd5ad656f1b0 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -143,14 +143,6 @@ class CommandObjectProcessLaunch : public CommandObjectProcessLaunchOrAttach { ~CommandObjectProcessLaunch() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - Options *GetOptions() override { return &m_all_options; } std::optional GetRepeatCommand(Args ¤t_command_args, @@ -1015,9 +1007,7 @@ class CommandObjectProcessLoad : public CommandObjectParsed { OptionElementVector &opt_element_vector) override { if (!m_exe_ctx.HasProcessScope()) return; - - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), 
lldb::eDiskFileCompletion, request, nullptr); + CommandObject::HandleArgumentCompletion(request, opt_element_vector); } Options *GetOptions() override { return &m_options; } @@ -1292,13 +1282,6 @@ class CommandObjectProcessSaveCore : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - class CommandOptions : public Options { public: CommandOptions() = default; diff --git a/lldb/source/Commands/CommandObjectRegister.cpp b/lldb/source/Commands/CommandObjectRegister.cpp index a4d53e5c8dd5f..4ffdde1ee09f9 100644 --- a/lldb/source/Commands/CommandObjectRegister.cpp +++ b/lldb/source/Commands/CommandObjectRegister.cpp @@ -80,9 +80,7 @@ class CommandObjectRegisterRead : public CommandObjectParsed { OptionElementVector &opt_element_vector) override { if (!m_exe_ctx.HasProcessScope()) return; - - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eRegisterCompletion, request, nullptr); + CommandObject::HandleArgumentCompletion(request, opt_element_vector); } Options *GetOptions() override { return &m_option_group; } @@ -440,8 +438,7 @@ different for the same register when connected to different debug servers.)"); OptionElementVector &opt_element_vector) override { if (!m_exe_ctx.HasProcessScope() || request.GetCursorIndex() != 0) return; - CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eRegisterCompletion, request, nullptr); + CommandObject::HandleArgumentCompletion(request, opt_element_vector); } protected: diff --git a/lldb/source/Commands/CommandObjectSession.cpp b/lldb/source/Commands/CommandObjectSession.cpp index d140bdfdba57b..28506d6c59512 100644 --- a/lldb/source/Commands/CommandObjectSession.cpp +++ 
b/lldb/source/Commands/CommandObjectSession.cpp @@ -28,13 +28,6 @@ class CommandObjectSessionSave : public CommandObjectParsed { ~CommandObjectSessionSave() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - protected: void DoExecute(Args &args, CommandReturnObject &result) override { llvm::StringRef file_path; diff --git a/lldb/source/Commands/CommandObjectSettings.cpp b/lldb/source/Commands/CommandObjectSettings.cpp index 5fb7dcc80279f..0cf3d1daf7f52 100644 --- a/lldb/source/Commands/CommandObjectSettings.cpp +++ b/lldb/source/Commands/CommandObjectSettings.cpp @@ -262,14 +262,6 @@ class CommandObjectSettingsShow : public CommandObjectParsed { ~CommandObjectSettingsShow() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eSettingsNameCompletion, request, - nullptr); - } - protected: void DoExecute(Args &args, CommandReturnObject &result) override { result.SetStatus(eReturnStatusSuccessFinishResult); diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index c3ecdb7700c25..4e006e4bb0e0f 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -257,13 +257,6 @@ class CommandObjectTargetCreate : public CommandObjectParsed { Options *GetOptions() override { return &m_option_group; } - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - 
protected: void DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); @@ -2789,13 +2782,6 @@ class CommandObjectTargetModulesAdd : public CommandObjectParsed { Options *GetOptions() override { return &m_option_group; } - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - protected: OptionGroupOptions m_option_group; OptionGroupUUID m_uuid_option_group; @@ -3233,7 +3219,7 @@ class CommandObjectTargetModulesList : public CommandObjectParsed { : CommandObjectParsed( interpreter, "target modules list", "List current executable and dependent shared library images.") { - CommandArgumentData module_arg{eArgTypeShlibName, eArgRepeatStar}; + CommandArgumentData module_arg{eArgTypeModule, eArgRepeatStar}; m_arguments.push_back({module_arg}); } @@ -4343,13 +4329,6 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed { ~CommandObjectTargetSymbolsAdd() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eDiskFileCompletion, request, nullptr); - } - Options *GetOptions() override { return &m_option_group; } protected: @@ -5195,8 +5174,7 @@ class CommandObjectTargetStopHookDelete : public CommandObjectParsed { OptionElementVector &opt_element_vector) override { if (request.GetCursorIndex()) return; - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eStopHookIDCompletion, request, nullptr); + CommandObject::HandleArgumentCompletion(request, opt_element_vector); } protected: @@ -5251,8 +5229,7 @@ class CommandObjectTargetStopHookEnableDisable : public 
CommandObjectParsed { OptionElementVector &opt_element_vector) override { if (request.GetCursorIndex()) return; - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eStopHookIDCompletion, request, nullptr); + CommandObject::HandleArgumentCompletion(request, opt_element_vector); } protected: diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp index a1e7e3f11361e..52e493b13c61c 100644 --- a/lldb/source/Commands/CommandObjectThread.cpp +++ b/lldb/source/Commands/CommandObjectThread.cpp @@ -403,10 +403,7 @@ class CommandObjectThreadStepWithTypeAndScope : public CommandObjectParsed { OptionElementVector &opt_element_vector) override { if (request.GetCursorIndex()) return; - - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eThreadIndexCompletion, request, - nullptr); + CommandObject::HandleArgumentCompletion(request, opt_element_vector); } Options *GetOptions() override { return &m_all_options; } @@ -663,14 +660,6 @@ class CommandObjectThreadContinue : public CommandObjectParsed { ~CommandObjectThreadContinue() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eThreadIndexCompletion, request, - nullptr); - } - void DoExecute(Args &command, CommandReturnObject &result) override { bool synchronous_execution = m_interpreter.GetSynchronous(); diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp index f76420f3cc683..036b8e9d9def1 100644 --- a/lldb/source/Commands/CommandObjectType.cpp +++ b/lldb/source/Commands/CommandObjectType.cpp @@ -1758,14 +1758,6 @@ class CommandObjectTypeCategoryDefine : public CommandObjectParsed { ~CommandObjectTypeCategoryDefine() override = default; - void - 
HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eTypeCategoryNameCompletion, request, - nullptr); - } - protected: void DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); @@ -1859,14 +1851,6 @@ class CommandObjectTypeCategoryEnable : public CommandObjectParsed { ~CommandObjectTypeCategoryEnable() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eTypeCategoryNameCompletion, request, - nullptr); - } - protected: void DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); @@ -1926,14 +1910,6 @@ class CommandObjectTypeCategoryDelete : public CommandObjectParsed { ~CommandObjectTypeCategoryDelete() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eTypeCategoryNameCompletion, request, - nullptr); - } - protected: void DoExecute(Args &command, CommandReturnObject &result) override { const size_t argc = command.GetArgumentCount(); @@ -2033,14 +2009,6 @@ class CommandObjectTypeCategoryDisable : public CommandObjectParsed { ~CommandObjectTypeCategoryDisable() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eTypeCategoryNameCompletion, request, - nullptr); - } - protected: void DoExecute(Args &command, CommandReturnObject &result) override { 
const size_t argc = command.GetArgumentCount(); diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp index 438a16c50bd67..5b74b1ae43acc 100644 --- a/lldb/source/Commands/CommandObjectWatchpoint.cpp +++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp @@ -831,16 +831,6 @@ corresponding to the byte size of the data type."); ~CommandObjectWatchpointSetVariable() override = default; - void - HandleArgumentCompletion(CompletionRequest &request, - OptionElementVector &opt_element_vector) override { - if (request.GetCursorIndex() != 0) - return; - lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( - GetCommandInterpreter(), lldb::eVariablePathCompletion, request, - nullptr); - } - Options *GetOptions() override { return &m_option_group; } protected: diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp index 6ed0fd1f1ddbd..93c53e89f7d1a 100644 --- a/lldb/source/Interpreter/CommandObject.cpp +++ b/lldb/source/Interpreter/CommandObject.cpp @@ -305,6 +305,43 @@ void CommandObject::HandleCompletion(CompletionRequest &request) { } } +void CommandObject::HandleArgumentCompletion( + CompletionRequest &request, OptionElementVector &opt_element_vector) { + size_t num_arg_entries = GetNumArgumentEntries(); + if (num_arg_entries != 1) + return; + + CommandArgumentEntry *entry_ptr = GetArgumentEntryAtIndex(0); + if (!entry_ptr) { + assert(entry_ptr && "We said there was one entry, but there wasn't."); + return; // Not worth crashing if asserts are off... + } + + CommandArgumentEntry &entry = *entry_ptr; + // For now, we only handle the simple case of one homogenous argument type. 
+ if (entry.size() != 1) + return; + + // Look up the completion type, and if it has one, invoke it: + const CommandObject::ArgumentTableEntry *arg_entry = + FindArgumentDataByType(entry[0].arg_type); + const ArgumentRepetitionType repeat = entry[0].arg_repetition; + + if (arg_entry == nullptr || arg_entry->completion_type == lldb::eNoCompletion) + return; + + // FIXME: This should be handled higher in the Command Parser. + // Check the case where this command only takes one argument, and don't do + // the completion if we aren't on the first entry: + if (repeat == eArgRepeatPlain && request.GetCursorIndex() != 0) + return; + + lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), arg_entry->completion_type, request, nullptr); + +} + + bool CommandObject::HelpTextContainsWord(llvm::StringRef search_word, bool search_short_help, bool search_long_help, diff --git a/lldb/test/API/commands/help/TestHelp.py b/lldb/test/API/commands/help/TestHelp.py index 95ffdb3cc8b18..f0f5bcb321801 100644 --- a/lldb/test/API/commands/help/TestHelp.py +++ b/lldb/test/API/commands/help/TestHelp.py @@ -104,7 +104,7 @@ def test_help_image_du_line_should_work(self): def test_help_image_list_shows_positional_args(self): """Command 'help image list' should describe positional args.""" # 'image' is an alias for 'target modules'. 
- self.expect("help image list", substrs=[" [...]"]) + self.expect("help image list", substrs=[" [...]"]) @no_debug_info_test def test_help_target_variable_syntax(self): diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py index f71bc73928f0f..b4681062a7d4e 100644 --- a/lldb/test/API/functionalities/completion/TestCompletion.py +++ b/lldb/test/API/functionalities/completion/TestCompletion.py @@ -787,6 +787,27 @@ def test_register_read_and_write_on_x86(self): # register write can only take exact one register name as argument self.complete_from_to("register write rbx ", []) + def test_register_read_and_write_generic(self): + """Test the completion of the commands register read and write on x86""" + + self.build() + self.main_source_spec = lldb.SBFileSpec("main.cpp") + lldbutil.run_to_source_breakpoint(self, "// Break here", self.main_source_spec) + + # test cases for register read + self.complete_from_to("register read f", ["fp"]) + # register read can take multiple register names as arguments + self.complete_from_to("register read sp ", ["sp", "fp"]) + # complete with prefix '$' + self.complete_from_to("register read sp $", ["$sp", "$fp"]) + self.complete_from_to("register read $x0 ", ["sp", "fp"]) + + # test cases for register write + self.complete_from_to("register write ", ["fp", "sp"]) + self.complete_from_to("register write f", ["fp"]) + # register write can only take exact one register name as argument + self.complete_from_to("register write fp ", []) + def test_common_completion_target_stophook_ids(self): subcommands = ["delete", "enable", "disable"] From be8b2d1ea54f964603b89ab9d4dfad26afebb347 Mon Sep 17 00:00:00 2001 From: Moshe Date: Tue, 20 Feb 2024 17:52:07 -0500 Subject: [PATCH 021/351] Add explicit conversion to fix arm64 builds. (#82429) Fixes issue preventing builds on ARM-based Macs. https://github.com/llvm/llvm-project/issues/82205. 
Co-authored-by: Moshe Berman --- libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h index ea1fd68a5fcdf..fd915373020ec 100644 --- a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h +++ b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h @@ -161,8 +161,8 @@ LIBC_INLINE int set_except(int excepts) { LIBC_INLINE int raise_except(int excepts) { float zero = 0.0f; float one = 1.0f; - float large_value = FPBits::max_normal(); - float small_value = FPBits::min_normal(); + float large_value = FPBits::max_normal().get_val(); + float small_value = FPBits::min_normal().get_val(); auto divfunc = [](float a, float b) { __asm__ __volatile__("ldr s0, %0\n\t" "ldr s1, %1\n\t" @@ -277,8 +277,8 @@ LIBC_INLINE int set_env(const fenv_t *envp) { return 0; } const FEnv::FPState *state = reinterpret_cast(envp); - FEnv::set_control_word(state->ControlWord); - FEnv::set_status_word(state->StatusWord); + FEnv::set_control_word(static_cast(state->ControlWord)); + FEnv::set_status_word(static_cast(state->StatusWord)); return 0; } From 2236048f5fdde70dd95e97ccc87437424a371cef Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 20 Feb 2024 14:52:25 -0800 Subject: [PATCH 022/351] [flang] Further refine errors vs warnings for ambiguous generics (#80161) Ensure that the compiler emits a hard error for a generic interface with ambiguous specific procedures when it is declared as such, and the ambiguity doesn't involve optional or unlimited polymorphic dummy data arguments. 
But: emit an optional portability warning when the ambiguity in the generic interface is due to USE association's merging of multiple generics, as USE association may involve modules not under control of the programmer; we'll emit a hard error message if any the actual arguments in a particular reference to the generic procedure doesn't resolve to exactly one specific procedure. And don't emit warnings when potential ambiguity due to USE association is taking place in a module file; the warnings, if any, will have been produced when the module file was compiled. --- flang/lib/Semantics/check-declarations.cpp | 44 ++++++++++++++-------- flang/test/Semantics/resolve17.f90 | 2 +- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 816227fb3354f..2db3f9a27d8f4 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -192,7 +192,7 @@ class DistinguishabilityHelper { private: void SayNotDistinguishable(const Scope &, const SourceName &, GenericKind, - const Symbol &, const Symbol &, bool isError); + const Symbol &, const Symbol &, bool isHardConflict); void AttachDeclaration(parser::Message &, const Scope &, const Symbol &); SemanticsContext &context_; @@ -3513,6 +3513,11 @@ void DistinguishabilityHelper::Add(const Symbol &generic, GenericKind kind, } void DistinguishabilityHelper::Check(const Scope &scope) { + if (FindModuleFileContaining(scope)) { + // Distinguishability was checked when the module was created; + // don't let optional warnings then become errors now. 
+ return; + } for (const auto &[name, info] : nameToSpecifics_) { for (auto iter1{info.begin()}; iter1 != info.end(); ++iter1) { const auto &[ultimate, procInfo]{*iter1}; @@ -3534,15 +3539,21 @@ void DistinguishabilityHelper::Check(const Scope &scope) { void DistinguishabilityHelper::SayNotDistinguishable(const Scope &scope, const SourceName &name, GenericKind kind, const Symbol &proc1, - const Symbol &proc2, bool isError) { - if (!isError && - !context_.ShouldWarn( - common::LanguageFeature::IndistinguishableSpecifics)) { - // The rules for distinguishing specific procedures (F'2023 15.4.3.4.5) - // are inadequate for some real-world cases like pFUnit. - // When there are optional dummy arguments or unlimited polymorphic - // dummy data object arguments, the best that we can do is emit an optional - // portability warning. + const Symbol &proc2, bool isHardConflict) { + bool isUseAssociated{!scope.sourceRange().Contains(name)}; + // The rules for distinguishing specific procedures (F'2023 15.4.3.4.5) + // are inadequate for some real-world cases like pFUnit. + // When there are optional dummy arguments or unlimited polymorphic + // dummy data object arguments, the best that we can do is emit an optional + // portability warning. Also, named generics created by USE association + // merging shouldn't receive hard errors for ambiguity. + // (Non-named generics might be defined I/O procedures or defined + // assignments that need to be used by the runtime.) 
+ bool isWarning{!isHardConflict || (isUseAssociated && kind.IsName())}; + if (isWarning && + (!context_.ShouldWarn( + common::LanguageFeature::IndistinguishableSpecifics) || + FindModuleFileContaining(scope))) { return; } std::string name1{proc1.name().ToString()}; @@ -3557,17 +3568,20 @@ void DistinguishabilityHelper::SayNotDistinguishable(const Scope &scope, } } parser::Message *msg; - if (scope.sourceRange().Contains(name)) { + if (!isUseAssociated) { + CHECK(isWarning == !isHardConflict); msg = &context_.Say(name, - isError + isHardConflict ? "Generic '%s' may not have specific procedures '%s' and '%s' as their interfaces are not distinguishable"_err_en_US : "Generic '%s' should not have specific procedures '%s' and '%s' as their interfaces are not distinguishable by the rules in the standard"_port_en_US, MakeOpName(name), name1, name2); } else { msg = &context_.Say(*GetTopLevelUnitContaining(proc1).GetName(), - isError - ? "USE-associated generic '%s' may not have specific procedures '%s' and '%s' as their interfaces are not distinguishable"_err_en_US - : "USE-associated generic '%s' should not have specific procedures '%s' and '%s' as their interfaces are not distinguishable by the incomplete rules in the standard"_port_en_US, + isHardConflict + ? (isWarning + ? 
"USE-associated generic '%s' should not have specific procedures '%s' and '%s' as their interfaces are not distinguishable"_warn_en_US + : "USE-associated generic '%s' may not have specific procedures '%s' and '%s' as their interfaces are not distinguishable"_err_en_US) + : "USE-associated generic '%s' should not have specific procedures '%s' and '%s' as their interfaces are not distinguishable by the rules in the standard"_port_en_US, MakeOpName(name), name1, name2); } AttachDeclaration(*msg, scope, proc1); diff --git a/flang/test/Semantics/resolve17.f90 b/flang/test/Semantics/resolve17.f90 index 513676fe670a1..770af756d03bc 100644 --- a/flang/test/Semantics/resolve17.f90 +++ b/flang/test/Semantics/resolve17.f90 @@ -180,7 +180,7 @@ subroutine g() end end module subroutine s9 - !ERROR: USE-associated generic 'g' may not have specific procedures 'g' and 'g' as their interfaces are not distinguishable + !PORTABILITY: USE-associated generic 'g' should not have specific procedures 'g' and 'g' as their interfaces are not distinguishable use m9a use m9b end From 39cab1a0a0d68cb33142099c320674fa54e11a91 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 20 Feb 2024 14:55:44 -0800 Subject: [PATCH 023/351] [AMDGPU] Add v2bf16 for opsel immediate folding (#82435) This was previously enabled since v2bf16 was represented by v2f16. As of now it is NFC since we only have dot instructions which could use it, but currently folding is guarded by the hasDOTOpSelHazard(). 
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 8bf05682cbe7e..d16d8ebd41a54 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -219,8 +219,10 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { default: return false; case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: break; } From 54b014b3f76e1c0060bd129e1196b6c729cb30b0 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Tue, 20 Feb 2024 15:03:53 -0800 Subject: [PATCH 024/351] [llvm-jitlink] Use '@' rather than ':' for separator in -sectcreate. This should avoid the issue with Windows paths that have caused failures on some builders. --- llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test | 2 +- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test index 244827196f485..33ad5515a6357 100644 --- a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test +++ b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test @@ -1,6 +1,6 @@ # RUN: llc -filetype=obj -o %t.o %S/Inputs/main-ret-0.ll # RUN: llvm-jitlink -noexec \ -# RUN: -sectcreate __data,%S/Inputs/sectcreate-data.txt:foo=0 \ +# RUN: -sectcreate __data,%S/Inputs/sectcreate-data.txt@foo=0 \ # RUN: %t.o # # Use -sectcreate to create a section from a data file. 
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index f6280779ded10..f0b8310a32efd 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -165,7 +165,7 @@ static cl::list static cl::list SectCreate("sectcreate", - cl::desc("given ,[:=,...] " + cl::desc("given ,[@=,...] " "add the content of to "), cl::cat(JITLinkCategory)); @@ -1683,7 +1683,7 @@ static Error addSectCreates(Session &S, StringRef SCArg(*SCItr); - auto [SectAndFileName, ExtraSymbolsString] = SCArg.split(':'); + auto [SectAndFileName, ExtraSymbolsString] = SCArg.split('@'); auto [SectName, FileName] = SectAndFileName.rsplit(','); if (SectName.empty()) return make_error("In -sectcreate=" + SCArg + From 3ff805540173b83d73b673b39ac5760fc19bac15 Mon Sep 17 00:00:00 2001 From: Michael Spencer Date: Tue, 20 Feb 2024 15:20:40 -0800 Subject: [PATCH 025/351] [clang][ScanDeps] Canonicalize -D and -U flags (#82298) Canonicalize `-D` and `-U` flags by sorting them and only keeping the last instance of a given name. This optimization will only fire if all `-D` and `-U` flags start with a simple identifier that we can guarantee a simple analysis of can determine if two flags refer to the same identifier or not. See the comment on `getSimpleMacroName()` for details of what the issues are. 
--- .../DependencyScanningService.h | 5 +- .../DependencyScanningWorker.cpp | 74 ++++++++++++++++ .../optimize-canonicalize-macros.m | 87 +++++++++++++++++++ clang/tools/clang-scan-deps/ClangScanDeps.cpp | 1 + 4 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 clang/test/ClangScanDeps/optimize-canonicalize-macros.m diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h index 4f9867262a275..557f0e547ab4a 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h @@ -60,7 +60,10 @@ enum class ScanningOptimizations { /// Remove unused -ivfsoverlay arguments. VFS = 4, - DSS_LAST_BITMASK_ENUM(VFS), + /// Canonicalize -D and -U options. + Macros = 8, + + DSS_LAST_BITMASK_ENUM(Macros), Default = All }; diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 3cf3ad8a4e490..7477b930188b4 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -179,6 +179,78 @@ static void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) { DiagOpts.IgnoreWarnings = true; } +// Clang implements -D and -U by splatting text into a predefines buffer. This +// allows constructs such as `-DFඞ=3 "-D F\u{0D9E} 4 3 2”` to be accepted and +// define the same macro, or adding C++ style comments before the macro name. +// +// This function checks that the first non-space characters in the macro +// obviously form an identifier that can be uniqued on without lexing. Failing +// to do this could lead to changing the final definition of a macro. 
+// +// We could set up a preprocessor and actually lex the name, but that's very +// heavyweight for a situation that will almost never happen in practice. +static std::optional getSimpleMacroName(StringRef Macro) { + StringRef Name = Macro.split("=").first.ltrim(" \t"); + std::size_t I = 0; + + auto FinishName = [&]() -> std::optional { + StringRef SimpleName = Name.slice(0, I); + if (SimpleName.empty()) + return std::nullopt; + return SimpleName; + }; + + for (; I != Name.size(); ++I) { + switch (Name[I]) { + case '(': // Start of macro parameter list + case ' ': // End of macro name + case '\t': + return FinishName(); + case '_': + continue; + default: + if (llvm::isAlnum(Name[I])) + continue; + return std::nullopt; + } + } + return FinishName(); +} + +static void canonicalizeDefines(PreprocessorOptions &PPOpts) { + using MacroOpt = std::pair; + std::vector SimpleNames; + SimpleNames.reserve(PPOpts.Macros.size()); + std::size_t Index = 0; + for (const auto &M : PPOpts.Macros) { + auto SName = getSimpleMacroName(M.first); + // Skip optimizing if we can't guarantee we can preserve relative order. + if (!SName) + return; + SimpleNames.emplace_back(*SName, Index); + ++Index; + } + + llvm::stable_sort(SimpleNames, [](const MacroOpt &A, const MacroOpt &B) { + return A.first < B.first; + }); + // Keep the last instance of each macro name by going in reverse + auto NewEnd = std::unique( + SimpleNames.rbegin(), SimpleNames.rend(), + [](const MacroOpt &A, const MacroOpt &B) { return A.first == B.first; }); + SimpleNames.erase(SimpleNames.begin(), NewEnd.base()); + + // Apply permutation. 
+ decltype(PPOpts.Macros) NewMacros; + NewMacros.reserve(SimpleNames.size()); + for (std::size_t I = 0, E = SimpleNames.size(); I != E; ++I) { + std::size_t OriginalIndex = SimpleNames[I].second; + // We still emit undefines here as they may be undefining a predefined macro + NewMacros.push_back(std::move(PPOpts.Macros[OriginalIndex])); + } + std::swap(PPOpts.Macros, NewMacros); +} + /// A clang tool that runs the preprocessor in a mode that's optimized for /// dependency scanning for the given compiler invocation. class DependencyScanningAction : public tooling::ToolAction { @@ -203,6 +275,8 @@ class DependencyScanningAction : public tooling::ToolAction { CompilerInvocation OriginalInvocation(*Invocation); // Restore the value of DisableFree, which may be modified by Tooling. OriginalInvocation.getFrontendOpts().DisableFree = DisableFree; + if (any(OptimizeArgs & ScanningOptimizations::Macros)) + canonicalizeDefines(OriginalInvocation.getPreprocessorOpts()); if (Scanned) { // Scanning runs once for the first -cc1 invocation in a chain of driver diff --git a/clang/test/ClangScanDeps/optimize-canonicalize-macros.m b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m new file mode 100644 index 0000000000000..2c9b06be39210 --- /dev/null +++ b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m @@ -0,0 +1,87 @@ +// This test verifies that command lines with equivalent -D and -U arguments +// are canonicalized to the same module variant. + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands.json.in > %t/build/compile-commands.json +// RUN: clang-scan-deps -compilation-database %t/build/compile-commands.json \ +// RUN: -j 1 -format experimental-full -optimize-args=canonicalize-macros > %t/deps.db +// RUN: cat %t/deps.db | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t + +// Verify that there are only two variants and that the expected merges have +// happened. 
+ +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK-NEXT: { +// CHECK-NEXT: "clang-module-deps": [], +// CHECK-NEXT: "clang-modulemap-file": +// CHECK-NEXT: "command-line": [ +// CHECK-NOT: "J=1" +// CHECK-NOT: "J" +// CHECK-NOT: "K" +// CHECK: ], +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "file-deps": [ +// CHECK: ], +// CHECK-NEXT: "name": "A" +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "clang-module-deps": [], +// CHECK-NEXT: "clang-modulemap-file": +// CHECK-NEXT: "command-line": [ +// CHECK: "Fඞ" +// CHECK: "F\\u{0D9E}" +// CHECK: "K" +// CHECK: "K" +// CHECK: ], +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "file-deps": [ +// CHECK: ], +// CHECK-NEXT: "name": "A" +// CHECK-NEXT: } +// CHECK-NEXT: ], +// CHECK-NEXT: "translation-units": [ +// CHECK: ] +// CHECK: } + + +//--- build/compile-commands.json.in + +[ +{ + "directory": "DIR", + "command": "clang -c DIR/tu0.m -DJ=1 -UJ -DJ=2 -DI -DK(x)=x -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu0.m" +}, +{ + "directory": "DIR", + "command": "clang -c DIR/tu1.m -DK -DK(x)=x -DI -D \"J=2\" -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu1.m" +}, +{ + "directory": "DIR", + "command": "clang -c DIR/tu2.m -I modules/A -DFඞ '-DF\\u{0D9E}' -DK -DK -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu2.m" +} +] + +//--- modules/A/module.modulemap + +module A { + umbrella header "A.h" +} + +//--- modules/A/A.h + +//--- tu0.m + +#include + +//--- tu1.m + +#include + +//--- tu2.m + +#include diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 0458a4b3ecec3..9811d2a875335 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -157,6 +157,7 @@ static void ParseArgs(int argc, char **argv) { .Case("header-search", 
ScanningOptimizations::HeaderSearch) .Case("system-warnings", ScanningOptimizations::SystemWarnings) .Case("vfs", ScanningOptimizations::VFS) + .Case("canonicalize-macros", ScanningOptimizations::Macros) .Case("all", ScanningOptimizations::All) .Default(std::nullopt); if (!Optimization) { From d3fcf310310ddfea1acf0d54bb7574ea2f6d9077 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Tue, 20 Feb 2024 16:03:54 -0800 Subject: [PATCH 026/351] AMDGPU: Use HasFP8ConversionInsts appropriately, NFC (#82433) The corresponding fp8 conversion instructions are available for a subtarget when and only when the subtarget "HasFP8ConversionInsts". We should not assume all the future subtargets (gfx12+) have FP8ConversionInsts. In this patch, we use OtherPredicates to carry HasFP8ConversionInsts feature. This is because SubtargetPredicate is not copied from pseudos to reals for DPP16 and DPP6. To avoid overriding OtherPredicates in a few places, we use the newly introduced True16Predicate to hold UseRealTrue16Insts instead. This work repalces the inadvertently closed pull request: https://github.com/llvm/llvm-project/pull/82024 --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 7 ++++--- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4 +++- llvm/lib/Target/AMDGPU/VOPInstructions.td | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 0d4057b3ddd10..99f8e8ede4ace 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1,3 +1,4 @@ + //===-- VOP1Instructions.td - Vector Instruction Definitions --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
@@ -565,7 +566,7 @@ class VOPProfile_Base_CVT_F32_F8 : VOPProfileI2F { def VOPProfileCVT_F32_F8 : VOPProfile_Base_CVT_F32_F8 ; def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 ; -let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0, +let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { defm V_CVT_F32_FP8 : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>; defm V_CVT_F32_BF8 : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>; @@ -653,7 +654,7 @@ class Cvt_F32_F8_Pat_OpSel index, (inst_e32 $src)) >; -let SubtargetPredicate = isGFX12Plus in { +let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in { foreach Index = [0, 1, 2, 3] in { def : Cvt_F32_F8_Pat_OpSel; @@ -670,7 +671,7 @@ class Cvt_PK_F32_F8_Pat_OpSel; -let SubtargetPredicate = isGFX12Plus in { +let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in { foreach Index = [0, -1] in { def : Cvt_PK_F32_F8_Pat_OpSel; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 35cffa22f4592..e7b8a7b889f0f 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -640,7 +640,7 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile>; -let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0, +let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>; @@ -667,6 +667,7 @@ class Cvt_SR_F8_F32_Pat index, VOP3_Pseudo inst> !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0) >; +let OtherPredicates = [HasFP8ConversionInsts] in { foreach Index = [0, -1] in { def : Cvt_PK_F8_F32_Pat; def : Cvt_PK_F8_F32_Pat; @@ -676,6 +677,7 @@ foreach Index = [0, 1, 2, 3] in { def : Cvt_SR_F8_F32_Pat; def : 
Cvt_SR_F8_F32_Pat; } +} class ThreeOp_i32_Pats : GCNPat < // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index c47c13dbb8402..801afabbdb140 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -210,7 +210,7 @@ class VOP3_Real : VOP3_Real { let AssemblerPredicate = Gen.AssemblerPredicate; - let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); + let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1349,7 +1349,7 @@ class VOP3_DPP16_Gen op, VOP_DPP_Pseudo ps, GFXGen Gen, string opName = ps.OpName> : VOP3_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; - let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); + let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); let DecoderNamespace = "DPP"#Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } From 53e96984b6dbb9d8ff55d2ccd0c27ffc1d27315f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 20 Feb 2024 18:12:27 -0600 Subject: [PATCH 027/351] [NVPTX] Enable the _Float16 type for NVPTX compilation (#82436) Summary: The PTX target supports the f16 type natively and we alreaqdy have a few LLVM backend tests that support the LLVM-IR. We should be able to enable this for generic use. This is done prior the f16 math functions being written in the GPU libc case. 
--- clang/docs/LanguageExtensions.rst | 1 + clang/lib/Basic/Targets/NVPTX.cpp | 4 ++++ clang/test/SemaCUDA/float16.cu | 1 + 3 files changed, 6 insertions(+) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index fb4d7a02dd086..711baf45f449a 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -833,6 +833,7 @@ to ``float``; see below for more information on this emulation. * 32-bit ARM (natively on some architecture versions) * 64-bit ARM (AArch64) (natively on ARMv8.2a and above) * AMDGPU (natively) + * NVPTX (natively) * SPIR (natively) * X86 (if SSE2 is available; natively if AVX512-FP16 is also available) * RISC-V (natively if Zfh or Zhinx is available) diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index a8efae3a1ce38..b47c399fef604 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -61,6 +61,10 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, NoAsmVariants = true; GPU = CudaArch::UNUSED; + // PTX supports f16 as a fundamental type. + HasLegalHalfType = true; + HasFloat16 = true; + if (TargetPointerWidth == 32) resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); else if (Opts.NVPTXUseShortPointers) diff --git a/clang/test/SemaCUDA/float16.cu b/clang/test/SemaCUDA/float16.cu index a9cbe87f32c10..bb5ed60643849 100644 --- a/clang/test/SemaCUDA/float16.cu +++ b/clang/test/SemaCUDA/float16.cu @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s +// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple nvptx64 -verify %s // expected-no-diagnostics #include "Inputs/cuda.h" From dc672d2f6a48fb3d502c260eb353f389723ec417 Mon Sep 17 00:00:00 2001 From: jimingham Date: Tue, 20 Feb 2024 16:28:48 -0800 Subject: [PATCH 028/351] Remove the "generic" register completion test. 
(#82445) For reasons that are not clear to me, on arm64, the alias registers are listed in list of register info's we do completion against, but for x86_64 they are not. Maybe this is a difference in how the dynamic register builders work for the two systems. Anyway, it doesn't look possible to make a generic one. --- .../completion/TestCompletion.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py index b4681062a7d4e..f71bc73928f0f 100644 --- a/lldb/test/API/functionalities/completion/TestCompletion.py +++ b/lldb/test/API/functionalities/completion/TestCompletion.py @@ -787,27 +787,6 @@ def test_register_read_and_write_on_x86(self): # register write can only take exact one register name as argument self.complete_from_to("register write rbx ", []) - def test_register_read_and_write_generic(self): - """Test the completion of the commands register read and write on x86""" - - self.build() - self.main_source_spec = lldb.SBFileSpec("main.cpp") - lldbutil.run_to_source_breakpoint(self, "// Break here", self.main_source_spec) - - # test cases for register read - self.complete_from_to("register read f", ["fp"]) - # register read can take multiple register names as arguments - self.complete_from_to("register read sp ", ["sp", "fp"]) - # complete with prefix '$' - self.complete_from_to("register read sp $", ["$sp", "$fp"]) - self.complete_from_to("register read $x0 ", ["sp", "fp"]) - - # test cases for register write - self.complete_from_to("register write ", ["fp", "sp"]) - self.complete_from_to("register write f", ["fp"]) - # register write can only take exact one register name as argument - self.complete_from_to("register write fp ", []) - def test_common_completion_target_stophook_ids(self): subcommands = ["delete", "enable", "disable"] From 646c7e528325f239638c5e758631b999993510d8 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers 
Date: Tue, 20 Feb 2024 16:29:17 -0800 Subject: [PATCH 029/351] [libc] add more stdbit.h entrypoints to additional targets (#82440) stdbit.h isn't complete yet, but looking to turn these on on more targets for earlier feedback. --- libc/config/baremetal/arm/entrypoints.txt | 43 +++++++++++++++++++++ libc/config/baremetal/riscv/entrypoints.txt | 42 ++++++++++++++++++++ libc/config/gpu/entrypoints.txt | 42 ++++++++++++++++++++ libc/config/linux/aarch64/entrypoints.txt | 35 +++++++++++++++++ libc/config/linux/arm/entrypoints.txt | 35 +++++++++++++++++ libc/config/linux/riscv/entrypoints.txt | 35 +++++++++++++++++ 6 files changed, 232 insertions(+) diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index f725b1c2394c6..608ac46034306 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -73,6 +73,49 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdio.vsprintf libc.src.stdio.vsnprintf + + # stdbit.h entrypoints + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_us + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_us + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + 
libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_one_uc + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_one_ui + libc.src.stdbit.stdc_first_leading_one_ul + libc.src.stdbit.stdc_first_leading_one_ull + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_us + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_one_uc + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_one_ui + libc.src.stdbit.stdc_first_trailing_one_ul + libc.src.stdbit.stdc_first_trailing_one_ull + # stdlib.h entrypoints libc.src.stdlib.abort libc.src.stdlib.abs diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index f725b1c2394c6..2f299e992be09 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -73,6 +73,48 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdio.vsprintf libc.src.stdio.vsnprintf + # stdbit.h entrypoints + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_us + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + 
libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_us + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_one_uc + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_one_ui + libc.src.stdbit.stdc_first_leading_one_ul + libc.src.stdbit.stdc_first_leading_one_ull + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_us + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_one_uc + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_one_ui + libc.src.stdbit.stdc_first_trailing_one_ul + libc.src.stdbit.stdc_first_trailing_one_ull + # stdlib.h entrypoints libc.src.stdlib.abort libc.src.stdlib.abs diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index b333c6be14462..5224e92bbcc58 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -65,6 +65,48 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.string.strtok_r libc.src.string.strxfrm + # stdbit.h entrypoints + libc.src.stdbit.stdc_leading_zeros_uc + libc.src.stdbit.stdc_leading_zeros_us + libc.src.stdbit.stdc_leading_zeros_ui + libc.src.stdbit.stdc_leading_zeros_ul + libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_us + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_us + 
libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_us + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_one_uc + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_one_ui + libc.src.stdbit.stdc_first_leading_one_ul + libc.src.stdbit.stdc_first_leading_one_ull + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_us + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_one_uc + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_one_ui + libc.src.stdbit.stdc_first_trailing_one_ul + libc.src.stdbit.stdc_first_trailing_one_ull + # stdlib.h entrypoints libc.src.stdlib.abs libc.src.stdlib.atoi diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 6e194682df4bf..8a6c160c09932 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -95,6 +95,41 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdbit.stdc_leading_zeros_ui libc.src.stdbit.stdc_leading_zeros_ul libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_us + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_us + 
libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_us + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_one_uc + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_one_ui + libc.src.stdbit.stdc_first_leading_one_ul + libc.src.stdbit.stdc_first_leading_one_ull + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_us + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_one_uc + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_one_ui + libc.src.stdbit.stdc_first_trailing_one_ul + libc.src.stdbit.stdc_first_trailing_one_ull # stdlib.h entrypoints libc.src.stdlib.abs diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 9bacfab7b0e5a..7df1904908886 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -73,6 +73,41 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdbit.stdc_leading_zeros_ui libc.src.stdbit.stdc_leading_zeros_ul libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_us + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_ones_ui + 
libc.src.stdbit.stdc_trailing_ones_ul + libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_us + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_one_uc + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_one_ui + libc.src.stdbit.stdc_first_leading_one_ul + libc.src.stdbit.stdc_first_leading_one_ull + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_us + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_one_uc + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_one_ui + libc.src.stdbit.stdc_first_trailing_one_ul + libc.src.stdbit.stdc_first_trailing_one_ull # stdlib.h entrypoints libc.src.stdlib.abs diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 71ff4bcfc3519..5c8cc7618a9e8 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -97,6 +97,41 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdbit.stdc_leading_zeros_ui libc.src.stdbit.stdc_leading_zeros_ul libc.src.stdbit.stdc_leading_zeros_ull + libc.src.stdbit.stdc_leading_ones_uc + libc.src.stdbit.stdc_leading_ones_us + libc.src.stdbit.stdc_leading_ones_ui + libc.src.stdbit.stdc_leading_ones_ul + libc.src.stdbit.stdc_leading_ones_ull + libc.src.stdbit.stdc_trailing_zeros_uc + libc.src.stdbit.stdc_trailing_zeros_us + libc.src.stdbit.stdc_trailing_zeros_ui + libc.src.stdbit.stdc_trailing_zeros_ul + libc.src.stdbit.stdc_trailing_zeros_ull + libc.src.stdbit.stdc_trailing_ones_uc + libc.src.stdbit.stdc_trailing_ones_us + libc.src.stdbit.stdc_trailing_ones_ui + libc.src.stdbit.stdc_trailing_ones_ul + 
libc.src.stdbit.stdc_trailing_ones_ull + libc.src.stdbit.stdc_first_leading_zero_uc + libc.src.stdbit.stdc_first_leading_zero_us + libc.src.stdbit.stdc_first_leading_zero_ui + libc.src.stdbit.stdc_first_leading_zero_ul + libc.src.stdbit.stdc_first_leading_zero_ull + libc.src.stdbit.stdc_first_leading_one_uc + libc.src.stdbit.stdc_first_leading_one_us + libc.src.stdbit.stdc_first_leading_one_ui + libc.src.stdbit.stdc_first_leading_one_ul + libc.src.stdbit.stdc_first_leading_one_ull + libc.src.stdbit.stdc_first_trailing_zero_uc + libc.src.stdbit.stdc_first_trailing_zero_us + libc.src.stdbit.stdc_first_trailing_zero_ui + libc.src.stdbit.stdc_first_trailing_zero_ul + libc.src.stdbit.stdc_first_trailing_zero_ull + libc.src.stdbit.stdc_first_trailing_one_uc + libc.src.stdbit.stdc_first_trailing_one_us + libc.src.stdbit.stdc_first_trailing_one_ui + libc.src.stdbit.stdc_first_trailing_one_ul + libc.src.stdbit.stdc_first_trailing_one_ull # stdlib.h entrypoints libc.src.stdlib.abs From ff4d6c64ee4269e4a9b67a4dae7e0b82ae1c3419 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 21 Feb 2024 00:14:59 +0000 Subject: [PATCH 030/351] Fix llvm-x86_64-debian-dylib buildbot This was broken by 91a384621e5b762d9c173ffd247cfeadd5f436a2. 
--- llvm/test/lit.cfg.py | 7 ++++--- llvm/test/lit.site.cfg.py.in | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 8ecae5dbe3720..4c05317036d1a 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -415,10 +415,11 @@ def version_int(ver): config.available_features.add("llvm-dylib") config.substitutions.append( ( + # libLLVM.so.19.0git "%llvmdylib", - "{}/libLLVM-{}{}".format( - config.llvm_shlib_dir, config.llvm_dylib_version, config.llvm_shlib_ext - ), + "{}/libLLVM{}.{}".format( + config.llvm_shlib_dir, config.llvm_shlib_ext, config.llvm_dylib_version + ) ) ) diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 1138b2ccf7bce..b6f255d472d16 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -44,7 +44,7 @@ config.build_examples = @LLVM_BUILD_EXAMPLES@ config.enable_threads = @LLVM_ENABLE_THREADS@ config.build_shared_libs = @BUILD_SHARED_LIBS@ config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@ -config.llvm_dylib_version = "@LLVM_VERSION_MAJOR@@LLVM_VERSION_SUFFIX@" +config.llvm_dylib_version = "@LLVM_VERSION_MAJOR@.@LLVM_VERSION_MINOR@@LLVM_VERSION_SUFFIX@" config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' config.host_arch = "@HOST_ARCH@" config.have_opt_viewer_modules = @LLVM_HAVE_OPT_VIEWER_MODULES@ From 98db8d0cb78e9dd3f78427d519ae8dd175b70b03 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 20 Feb 2024 16:34:40 -0800 Subject: [PATCH 031/351] [AMDGPU] Fix v_dot2_f16_f16/v_dot2_bf16_bf16 operands (#82423) src0 and src1 are packed f16/bf16, we are printing literals like 0x40002000, but we cannot parse it. 
--- .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 3 +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 14 ++++++++------ llvm/lib/Target/AMDGPU/VOP3Instructions.td | 13 +------------ llvm/test/MC/AMDGPU/gfx11_asm_vop3.s | 12 ++++++++++++ .../MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt | 12 ++++++++++++ 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 85bd33e4efbd0..5b32b34079f44 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -323,6 +323,9 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f32); } + bool isPackedFP16InputMods() const { + return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::v2f16); + } bool isVReg() const { return isRegClass(AMDGPU::VGPR_32RegClassID) || diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index cd14c12a8a80c..97c723752b70b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1289,9 +1289,8 @@ def IntVRegInputMods : InputMods { class PackedFPInputModsMatchClass : AsmOperandClass { let Name = "PackedFP"#opSize#"InputMods"; - let ParserMethod = "parseRegOrImm"; - let PredicateMethod = "isRegOrImm"; -// let PredicateMethod = "isPackedFP"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImmWithFPInputMods"; + let PredicateMethod = "isPackedFP"#opSize#"InputMods"; } class PackedIntInputModsMatchClass : AsmOperandClass { @@ -1305,7 +1304,7 @@ def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>; def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>; class PackedFPInputMods : InputMods { -// let PrintMethod = "printPackedFPInputMods"; + let PrintMethod = "printOperandAndFPInputMods"; } class PackedIntInputMods : InputMods { @@ -1606,8 +1605,11 @@ class getSrcMod { } 
class getOpSelMod { - Operand ret = !if(!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), - FP16InputMods, IntOpSelMods); + Operand ret = !cond(!eq(VT, f16) : FP16InputMods, + !eq(VT, bf16) : FP16InputMods, + !eq(VT, v2f16) : PackedF16InputMods, + !eq(VT, v2bf16) : PackedF16InputMods, + 1 : IntOpSelMods); } // Return type of input modifiers operand specified input operand for DPP diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index e7b8a7b889f0f..396ae9c9d92ee 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -868,20 +868,9 @@ def : DivFmasPat; def : DivFmasPat; } -class VOP3_DOT_Profile : VOP3_Profile { +class VOP3_DOT_Profile : VOP3_Profile { let HasClamp = 0; let HasOMod = 0; - // Override modifiers for bf16(i16) (same as float modifiers). - let HasSrc0Mods = 1; - let HasSrc1Mods = 1; - let HasSrc2Mods = 1; - let Src0ModVOP3DPP = FPVRegInputMods; - let Src1ModVOP3DPP = FPVRegInputMods; - let Src2ModVOP3DPP = FP16InputMods; - let InsVOP3OpSel = getInsVOP3OpSel.ret; - let AsmVOP3OpSel = getAsmVOP3OpSel.ret; } let SubtargetPredicate = isGFX11Plus in { diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s index 9a94162005e1f..d288c02a22c92 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s @@ -2116,6 +2116,12 @@ v_dot2_bf16_bf16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] v_dot2_bf16_bf16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] // GFX11: encoding: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_dot2_bf16_bf16 v2, v0, 0x20004000, v2 +// GFX11: v_dot2_bf16_bf16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20] + +v_dot2_bf16_bf16 v2, 0x20004000, v0, v2 +// GFX11: v_dot2_bf16_bf16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20] + v_dot2_f16_f16 v5, v1, 
v2, s3 // GFX11: encoding: [0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00] @@ -2161,6 +2167,12 @@ v_dot2_f16_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] v_dot2_f16_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] // GFX11: encoding: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] +v_dot2_f16_f16 v2, v0, 0x20004000, v2 +// GFX11: v_dot2_f16_f16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20] + +v_dot2_f16_f16 v2, 0x20004000, v0, v2 +// GFX11: v_dot2_f16_f16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20] + v_fma_dx9_zero_f32 v5, v1, v2, s3 // GFX11: encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt index 7674c02185b5f..fc35a2e6b4f8f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt @@ -1788,6 +1788,12 @@ # GFX11: v_dot2_bf16_bf16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] ; encoding: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] 0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 +# GFX11: v_dot2_bf16_bf16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20] +0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20 + +# GFX11: v_dot2_bf16_bf16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20] +0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20 + # GFX11: v_dot2_f16_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00 @@ -1833,6 +1839,12 @@ # GFX11: v_dot2_f16_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] ; encoding: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00] 0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00 +# GFX11: v_dot2_f16_f16 
v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20] +0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20 + +# GFX11: v_dot2_f16_f16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20] +0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20 + # GFX11: v_fma_dx9_zero_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00 From f78027dfeca9925efe7e025beb05b4cef8a1581a Mon Sep 17 00:00:00 2001 From: Boian Petkantchin Date: Tue, 20 Feb 2024 16:53:26 -0800 Subject: [PATCH 032/351] [mlir][mesh] Better op result names (#82408) Implement OpAsmOpInterface for most ops to increase IR readability. For example `mesh.process_linear_index` would produce a value with name `proc_linear_idx`. --- mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td | 25 ++++-- mlir/lib/Dialect/Mesh/IR/MeshOps.cpp | 84 ++++++++++++++++++- .../Mesh/process-multi-index-op-lowering.mlir | 4 +- 3 files changed, 103 insertions(+), 10 deletions(-) diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td index 8ba7c111aea6b..b9cd15e206266 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td @@ -16,6 +16,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/BuiltinTypes.td" include "mlir/IR/CommonAttrConstraints.td" include "mlir/IR/CommonTypeConstraints.td" +include "mlir/IR/OpAsmInterface.td" include "mlir/IR/SymbolInterfaces.td" //===----------------------------------------------------------------------===// @@ -78,7 +79,10 @@ def Mesh_MeshOp : Mesh_Op<"mesh", [Symbol]> { } def Mesh_MeshShapeOp : Mesh_Op<"mesh_shape", [ - Pure, DeclareOpInterfaceMethods]> { + Pure, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ]> { let summary = "Get the shape of the mesh."; let arguments = (ins FlatSymbolRefAttr:$mesh, @@ -101,7 +105,11 @@ 
def Mesh_MeshShapeOp : Mesh_Op<"mesh_shape", [ ]; } -def Mesh_ShardOp : Mesh_Op<"shard", [Pure, SameOperandsAndResultType]> { +def Mesh_ShardOp : Mesh_Op<"shard", [ + Pure, + SameOperandsAndResultType, + DeclareOpInterfaceMethods + ]> { let summary = "Annotate on how a tensor is sharded across a mesh."; let description = [{ The mesh.shard operation is designed to specify and guide the sharding @@ -194,7 +202,8 @@ def Mesh_ShardOp : Mesh_Op<"shard", [Pure, SameOperandsAndResultType]> { def Mesh_ProcessMultiIndexOp : Mesh_Op<"process_multi_index", [ Pure, - DeclareOpInterfaceMethods + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods ]> { let summary = "Get the multi index of current device along specified mesh axes."; let description = [{ @@ -221,7 +230,8 @@ def Mesh_ProcessMultiIndexOp : Mesh_Op<"process_multi_index", [ def Mesh_ProcessLinearIndexOp : Mesh_Op<"process_linear_index", [ Pure, - DeclareOpInterfaceMethods + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods ]> { let summary = "Get the linear index of the current device."; let description = [{ @@ -248,7 +258,10 @@ class Mesh_CollectiveCommunicationOpBase< string mnemonic, list traits = []> : Mesh_Op])> { + [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ])> { dag commonArgs = (ins FlatSymbolRefAttr:$mesh, DefaultValuedAttr:$mesh_axes @@ -258,7 +271,7 @@ class Mesh_CollectiveCommunicationOpBase< def Mesh_AllGatherOp : Mesh_CollectiveCommunicationOpBase<"all_gather", [ Pure, SameOperandsAndResultElementType, - SameOperandsAndResultRank + SameOperandsAndResultRank, ]> { let summary = "All-gather over a device mesh."; let description = [{ diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp index 838255cf5a5ba..50163880e85f9 100644 --- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp +++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp @@ -24,7 +24,6 @@ #include "mlir/Support/LLVM.h" #include "mlir/Support/LogicalResult.h" #include "llvm/ADT/ArrayRef.h" -#include 
"llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -34,7 +33,6 @@ #include #include #include -#include #include #define DEBUG_TYPE "mesh-ops" @@ -244,6 +242,11 @@ void MeshShapeOp::build(OpBuilder &odsBuilder, OperationState &odsState, MeshAxesAttr::get(odsBuilder.getContext(), axes)); } +void MeshShapeOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResults()[0], "mesh_shape"); +} + //===----------------------------------------------------------------------===// // mesh.shard attr //===----------------------------------------------------------------------===// @@ -307,6 +310,15 @@ bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const { std::mem_fn(&MeshAxesAttr::empty)); } +//===----------------------------------------------------------------------===// +// mesh.shard op +//===----------------------------------------------------------------------===// + +void ShardOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "sharding_annotated"); +} + //===----------------------------------------------------------------------===// // mesh.process_multi_index op //===----------------------------------------------------------------------===// @@ -345,6 +357,11 @@ void ProcessMultiIndexOp::build(OpBuilder &odsBuilder, OperationState &odsState, MeshAxesAttr::get(odsBuilder.getContext(), axes)); } +void ProcessMultiIndexOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResults()[0], "proc_linear_idx"); +} + //===----------------------------------------------------------------------===// // mesh.process_linear_index op //===----------------------------------------------------------------------===// @@ -363,6 +380,11 @@ void ProcessLinearIndexOp::build(OpBuilder &odsBuilder, build(odsBuilder, odsState, mesh.getSymName()); } +void ProcessLinearIndexOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "proc_linear_idx"); +} + 
//===----------------------------------------------------------------------===// // collective communication ops //===----------------------------------------------------------------------===// @@ -606,6 +628,11 @@ void AllGatherOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void AllGatherOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "all_gather"); +} + //===----------------------------------------------------------------------===// // mesh.all_reduce op //===----------------------------------------------------------------------===// @@ -620,6 +647,11 @@ void AllReduceOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void AllReduceOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "all_reduce"); +} + //===----------------------------------------------------------------------===// // mesh.all_slice op //===----------------------------------------------------------------------===// @@ -654,6 +686,11 @@ void AllSliceOp::build(OpBuilder &odsBuilder, OperationState &odsState, APInt(sizeof(sliceAxis) * CHAR_BIT, sliceAxis)); } +void AllSliceOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "all_slice"); +} + //===----------------------------------------------------------------------===// // mesh.all_to_all op //===----------------------------------------------------------------------===// @@ -674,6 +711,11 @@ void AllToAllOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void AllToAllOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "all_to_all"); +} + //===----------------------------------------------------------------------===// // mesh.broadcast op //===----------------------------------------------------------------------===// @@ -698,6 +740,11 @@ void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } 
+void BroadcastOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "broadcast"); +} + //===----------------------------------------------------------------------===// // mesh.gather op //===----------------------------------------------------------------------===// @@ -724,6 +771,11 @@ void GatherOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void GatherOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "gather"); +} + //===----------------------------------------------------------------------===// // mesh.recv op //===----------------------------------------------------------------------===// @@ -747,6 +799,10 @@ void RecvOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void RecvOp::getAsmResultNames(function_ref setNameFn) { + setNameFn(getResult(), "recv"); +} + //===----------------------------------------------------------------------===// // mesh.reduce op //===----------------------------------------------------------------------===// @@ -770,6 +826,11 @@ void ReduceOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void ReduceOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "reduce"); +} + //===----------------------------------------------------------------------===// // mesh.reduce_scatter op //===----------------------------------------------------------------------===// @@ -791,6 +852,11 @@ void ReduceScatterOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void ReduceScatterOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "reduce_scatter"); +} + //===----------------------------------------------------------------------===// // mesh.scatter op //===----------------------------------------------------------------------===// @@ -817,6 +883,11 @@ void 
ScatterOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void ScatterOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "scatter"); +} + //===----------------------------------------------------------------------===// // mesh.send op //===----------------------------------------------------------------------===// @@ -839,6 +910,10 @@ void SendOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add>(context); } +void SendOp::getAsmResultNames(function_ref setNameFn) { + setNameFn(getResult(), "send"); +} + //===----------------------------------------------------------------------===// // mesh.shift op //===----------------------------------------------------------------------===// @@ -865,6 +940,11 @@ void ShiftOp::getCanonicalizationPatterns(RewritePatternSet &patterns, // offset % shift_axis_mesh_dim_size == 0. } +void ShiftOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "shift"); +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Mesh/process-multi-index-op-lowering.mlir b/mlir/test/Dialect/Mesh/process-multi-index-op-lowering.mlir index 677a5982ea254..e23cfd79a4274 100644 --- a/mlir/test/Dialect/Mesh/process-multi-index-op-lowering.mlir +++ b/mlir/test/Dialect/Mesh/process-multi-index-op-lowering.mlir @@ -6,7 +6,7 @@ mesh.mesh @mesh2d(shape = ?x?) 
func.func @multi_index_2d_mesh() -> (index, index) { // CHECK: %[[LINEAR_IDX:.*]] = mesh.process_linear_index on @mesh2d : index // CHECK: %[[MESH_SHAPE:.*]]:2 = mesh.mesh_shape @mesh2d : index, index - // CHECK: %[[MULTI_IDX:.*]]:2 = affine.delinearize_index %0 into (%[[MESH_SHAPE]]#0, %[[MESH_SHAPE]]#1) : index, index + // CHECK: %[[MULTI_IDX:.*]]:2 = affine.delinearize_index %[[LINEAR_IDX]] into (%[[MESH_SHAPE]]#0, %[[MESH_SHAPE]]#1) : index, index %0:2 = mesh.process_multi_index on @mesh2d : index, index // CHECK: return %[[MULTI_IDX]]#0, %[[MULTI_IDX]]#1 : index, index return %0#0, %0#1 : index, index @@ -16,7 +16,7 @@ func.func @multi_index_2d_mesh() -> (index, index) { func.func @multi_index_2d_mesh_single_inner_axis() -> index { // CHECK: %[[LINEAR_IDX:.*]] = mesh.process_linear_index on @mesh2d : index // CHECK: %[[MESH_SHAPE:.*]]:2 = mesh.mesh_shape @mesh2d : index, index - // CHECK: %[[MULTI_IDX:.*]]:2 = affine.delinearize_index %0 into (%[[MESH_SHAPE]]#0, %[[MESH_SHAPE]]#1) : index, index + // CHECK: %[[MULTI_IDX:.*]]:2 = affine.delinearize_index %[[LINEAR_IDX]] into (%[[MESH_SHAPE]]#0, %[[MESH_SHAPE]]#1) : index, index %0 = mesh.process_multi_index on @mesh2d axes = [0] : index // CHECK: return %[[MULTI_IDX]]#0 : index return %0 : index From 4ca0480a4fefe25c2f6e36c04f02998af79274a0 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 20 Feb 2024 16:54:27 -0800 Subject: [PATCH 033/351] [Driver,BareMetal] Replace -lclang_rt.builtins{,-$arch}.a with an absolute path (#82424) The generic `tools::AddRunTimeLibs` uses an absolute path. Change BareMetal to match. I believe users are not supposed to place other files under the directory containing `libclang_rt.builtins-$arch.a`. If they rely on the implicit -L, they now need to explicitly specify -L. 
--- clang/lib/Driver/ToolChains/BareMetal.cpp | 11 +--- clang/test/Driver/arm-compiler-rt.c | 2 +- clang/test/Driver/baremetal-multilib.yaml | 2 +- clang/test/Driver/baremetal-sysroot.cpp | 2 +- clang/test/Driver/baremetal.cpp | 70 ++++++++--------------- 5 files changed, 27 insertions(+), 60 deletions(-) diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index cd955b6c84945..d5fc1d5dd25a8 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -368,11 +368,7 @@ void BareMetal::AddLinkRuntimeLib(const ArgList &Args, ToolChain::RuntimeLibType RLT = GetRuntimeLibType(Args); switch (RLT) { case ToolChain::RLT_CompilerRT: { - const std::string FileName = getCompilerRT(Args, "builtins"); - llvm::StringRef BaseName = llvm::sys::path::filename(FileName); - BaseName.consume_front("lib"); - BaseName.consume_back(".a"); - CmdArgs.push_back(Args.MakeArgString("-l" + BaseName)); + CmdArgs.push_back(getCompilerRTArgString(Args, "builtins")); return; } case ToolChain::RLT_Libgcc: @@ -462,11 +458,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, for (const auto &LibPath : TC.getLibraryPaths()) CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-L", LibPath))); - const std::string FileName = TC.getCompilerRT(Args, "builtins"); - llvm::SmallString<128> PathBuf{FileName}; - llvm::sys::path::remove_filename(PathBuf); - CmdArgs.push_back(Args.MakeArgString("-L" + PathBuf)); - if (TC.ShouldLinkCXXStdlib(Args)) TC.AddCXXStdlibLibArgs(Args, CmdArgs); diff --git a/clang/test/Driver/arm-compiler-rt.c b/clang/test/Driver/arm-compiler-rt.c index 954947bb890f8..adecacbcaabf9 100644 --- a/clang/test/Driver/arm-compiler-rt.c +++ b/clang/test/Driver/arm-compiler-rt.c @@ -3,7 +3,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir_with_arch_subdir \ // RUN: -rtlib=compiler-rt -### %s 2>&1 \ // RUN: | FileCheck %s -check-prefix ARM-EABI -// ARM-EABI: "-lclang_rt.builtins-arm" 
+// ARM-EABI: "{{[^"]*}}libclang_rt.builtins-arm.a" // RUN: %clang -target arm-linux-gnueabi \ // RUN: --sysroot=%S/Inputs/resource_dir_with_arch_subdir \ diff --git a/clang/test/Driver/baremetal-multilib.yaml b/clang/test/Driver/baremetal-multilib.yaml index af26e82621c91..3f026cbeb437b 100644 --- a/clang/test/Driver/baremetal-multilib.yaml +++ b/clang/test/Driver/baremetal-multilib.yaml @@ -17,7 +17,7 @@ # CHECK-SAME: "-x" "c++" "{{.*}}baremetal-multilib.yaml" # CHECK-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" # CHECK-SAME: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/lib" -# CHECK-SAME: "-lc" "-lm" "-lclang_rt.builtins" +# CHECK-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" # CHECK-SAME: "-o" "{{.*}}.tmp.out" # RUN: %T/baremetal_multilib/bin/clang -no-canonical-prefixes -x c++ %s -### -o %t.out 2>&1 \ diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp index fc66020772a77..46338185ffd9d 100644 --- a/clang/test/Driver/baremetal-sysroot.cpp +++ b/clang/test/Driver/baremetal-sysroot.cpp @@ -18,5 +18,5 @@ // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal-sysroot.cpp" // CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-V6M-C-SAME: "-L{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}lib" -// CHECK-V6M-C-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" +// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" // CHECK-V6M-C-SAME: "-o" "{{.*}}.o" diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp index 7511d7d1adb4d..8baf388894eb2 100644 --- a/clang/test/Driver/baremetal.cpp +++ b/clang/test/Driver/baremetal.cpp @@ -18,8 +18,7 @@ // CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for" // CHECK-V6M-C-SAME: 
"-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib" -// CHECK-V6M-C-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal" -// CHECK-V6M-C-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" "--target2=rel" "-o" "{{.*}}.tmp.out" +// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "{{.*}}.tmp.out" // RUN: %clang %s -### --target=armv6m-none-eabi -nostdlibinc -nobuiltininc 2>&1 \ // RUN: --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-LIBINC %s @@ -37,16 +36,15 @@ // CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-ARMV7M-PER-TARGET: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib" // CHECK-ARMV7M-PER-TARGET: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}armv7m-vendor-none-eabi -// CHECK-ARMV7M-PER-TARGET: "-lc" "-lm" "-lclang_rt.builtins" +// CHECK-ARMV7M-PER-TARGET: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \ // RUN: --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-DEFAULTCXX %s // CHECK-V6M-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib" -// CHECK-V6M-DEFAULTCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-V6M-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-V6M-DEFAULTCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" "--target2=rel" "-o" "a.out" +// CHECK-V6M-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out" // RUN: %clangxx %s -### --target=armv6m-none-eabi -stdlib=libc++ 2>&1 \ // RUN: --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-LIBCXX %s @@ -55,9 +53,8 @@ // CHECK-V6M-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1" // CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // 
CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib" -// CHECK-V6M-LIBCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-V6M-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-V6M-LIBCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" "--target2=rel" "-o" "a.out" +// CHECK-V6M-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out" // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \ // RUN: --sysroot=%S/Inputs/baremetal_arm \ @@ -68,9 +65,8 @@ // CHECK-V6M-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}6.0.0" // CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib" -// CHECK-V6M-LIBSTDCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind" -// CHECK-V6M-LIBSTDCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" "--target2=rel" "-o" "a.out" +// CHECK-V6M-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out" // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \ // RUN: --sysroot=%S/Inputs/baremetal_arm \ @@ -79,7 +75,6 @@ // CHECK-V6M-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-V6M-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-V6M-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib" -// CHECK-V6M-NDL-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // RUN: rm -rf %T/baremetal_cxx_sysroot // RUN: mkdir -p %T/baremetal_cxx_sysroot/usr/include/c++/v1 @@ -93,9 +88,8 @@ // CHECK-V6M-LIBCXX-USR-SAME: "-internal-isystem" "{{[^"]+}}baremetal_cxx_sysroot{{[/\\]+}}usr{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1" // CHECK-V6M-LIBCXX-USR: "{{[^"]*}}-Bstatic" // CHECK-V6M-LIBCXX-USR-SAME: "-L{{[^"]*}}{{[/\\]+}}baremetal_cxx_sysroot{{[/\\]+}}lib" -// 
CHECK-V6M-LIBCXX-USR-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-V6M-LIBCXX-USR-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-V6M-LIBCXX-USR-SAME: "-lc" "-lm" "-lclang_rt.builtins-armv6m" +// CHECK-V6M-LIBCXX-USR-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" // RUN: %clangxx --target=arm-none-eabi -v 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-THREAD-MODEL @@ -178,8 +172,7 @@ // CHECK-RV64-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for" // CHECK-RV64-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib" -// CHECK-RV64-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal" -// CHECK-RV64-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv64" "-X" "-o" "{{.*}}.tmp.out" +// CHECK-RV64-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "{{.*}}.tmp.out" // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -187,9 +180,8 @@ // CHECK-RV64-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib" -// CHECK-RV64-DEFAULTCXX-SAME: "-L[[RESOURCE_DIR]]{{.*}}{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-RV64-DEFAULTCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv64" "-X" "-o" "a.out" +// CHECK-RV64-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -200,9 +192,8 @@ // CHECK-RV64-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1" // CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-LIBCXX-SAME: 
"-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib" -// CHECK-RV64-LIBCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-RV64-LIBCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv64" "-X" "-o" "a.out" +// CHECK-RV64-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv64-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -213,9 +204,8 @@ // CHECK-RV64-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1" // CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib" -// CHECK-RV64-LIBSTDCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind" -// CHECK-RV64-LIBSTDCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv64" "-X" "-o" "a.out" +// CHECK-RV64-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out" // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: -L some/directory/user/asked/for \ @@ -230,8 +220,7 @@ // CHECK-RV32-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for" // CHECK-RV32-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib" -// CHECK-RV32-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal" -// CHECK-RV32-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv32" "-X" "-o" "a.out" +// CHECK-RV32-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -239,9 +228,8 @@ // CHECK-RV32-DEFAULTCXX: "-resource-dir" 
"[[RESOURCE_DIR:[^"]+]]" // CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib" -// CHECK-RV32-DEFAULTCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-RV32-DEFAULTCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv32" "-X" "-o" "a.out" +// CHECK-RV32-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -252,9 +240,8 @@ // CHECK-RV32-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1" // CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib" -// CHECK-RV32-LIBCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-RV32-LIBCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv32" "-X" "-o" "a.out" +// CHECK-RV32-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -265,9 +252,8 @@ // CHECK-RV32-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1" // CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib" -// CHECK-RV32-LIBSTDCXX-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind" -// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-lm" "-lclang_rt.builtins-riscv32" "-X" 
"-o" "a.out" +// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out" // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \ // RUN: -nostdlibinc -nobuiltininc \ @@ -286,7 +272,6 @@ // CHECK-RV64-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-RV64-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib" -// CHECK-RV64-NDL-SAME: "-L[[RESOURCE_DIR]]{{[/\\]+}}lib{{[/\\]+}}baremetal" // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \ // RUN: -march=rv64imafdc -mabi=lp64d \ @@ -306,7 +291,6 @@ // CHECK-RV64FD-SAME: "-x" "c++" "{{.*}}baremetal.cpp" // CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64FD-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}lib" -// CHECK-RV64FD-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d" // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: -march=rv32i -mabi=ilp32 \ @@ -326,7 +310,6 @@ // CHECK-RV32I-SAME: "-x" "c++" "{{.*}}baremetal.cpp" // CHECK-RV32I-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32I-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}lib" -// CHECK-RV32I-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv32i{{[/\\]+}}ilp32" // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: -march=rv32im -mabi=ilp32 \ @@ -346,7 +329,6 @@ // CHECK-RV32IM-SAME: "-x" "c++" "{{.*}}baremetal.cpp" // CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32IM-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}lib" -// CHECK-RV32IM-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv32im{{[/\\]+}}ilp32" // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: -march=rv32iac -mabi=ilp32 \ @@ -361,7 +343,6 @@ // CHECK-RV32IAC-SAME: "-x" "c++" 
"{{.*}}baremetal.cpp" // CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32IAC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}lib" -// CHECK-RV32IAC-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv32iac{{[/\\]+}}ilp32" // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf -march=rv32imafc -mabi=ilp32f \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -383,7 +364,6 @@ // CHECK-RV32IMAFC-SAME: "-x" "c++" "{{.*}}baremetal.cpp" // CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32IMAFC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}lib" -// CHECK-RV32IMAFC-SAME: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}baremetal{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f" // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc-unknown-eabi 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PPCEABI %s @@ -395,8 +375,7 @@ // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" // CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-PPCEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib" -// CHECK-PPCEABI-SAME: "-L[[RESOURCE]]{{[/\\]+}}lib{{[/\\]+}}baremetal" -// CHECK-PPCEABI-SAME: "-lc" "-lm" "-lclang_rt.builtins-powerpc" "-o" "a.out" +// CHECK-PPCEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc.a" "-o" "a.out" // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc64-unknown-eabi 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PPC64EABI %s @@ -408,8 +387,7 @@ // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" // CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-PPC64EABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib" -// CHECK-PPC64EABI-SAME: 
"-L[[RESOURCE]]{{[/\\]+}}lib{{[/\\]+}}baremetal" -// CHECK-PPC64EABI-SAME: "-lc" "-lm" "-lclang_rt.builtins-powerpc64" "-o" "a.out" +// CHECK-PPC64EABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc64.a" "-o" "a.out" // RUN: %clang -no-canonical-prefixes %s -### --target=powerpcle-unknown-eabi 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PPCLEEABI %s @@ -421,8 +399,7 @@ // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" // CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-PPCLEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib" -// CHECK-PPCLEEABI-SAME: "-L[[RESOURCE]]{{[/\\]+}}lib{{[/\\]+}}baremetal" -// CHECK-PPCLEEABI-SAME: "-lc" "-lm" "-lclang_rt.builtins-powerpcle" "-o" "a.out" +// CHECK-PPCLEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpcle.a" "-o" "a.out" // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc64le-unknown-eabi 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PPC64LEEABI %s @@ -434,8 +411,7 @@ // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" // CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-PPC64LEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib" -// CHECK-PPC64LEEABI-SAME: "-L[[RESOURCE]]{{[/\\]+}}lib{{[/\\]+}}baremetal" -// CHECK-PPC64LEEABI-SAME: "-lc" "-lm" "-lclang_rt.builtins-powerpc64le" "-o" "a.out" +// CHECK-PPC64LEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc64le.a" "-o" "a.out" // Check that compiler-rt library without the arch filename suffix will // be used if present. 
@@ -446,8 +422,8 @@ // RUN: --target=armv6m-none-eabi \ // RUN: --sysroot=%T/baremetal_clang_rt_noarch \ // RUN: | FileCheck --check-prefix=CHECK-CLANGRT-NOARCH %s -// CHECK-CLANGRT-NOARCH: "-lclang_rt.builtins" -// CHECK-CLANGRT-NOARCH-NOT: "-lclang_rt.builtins-armv6m" +// CHECK-CLANGRT-NOARCH: "{{[^"]*}}libclang_rt.builtins.a" +// CHECK-CLANGRT-NOARCH-NOT: "{{[^"]*}}libclang_rt.builtins-armv6m.a" // Check that compiler-rt library with the arch filename suffix will be // used if present. @@ -458,8 +434,8 @@ // RUN: --target=armv6m-none-eabi \ // RUN: --sysroot=%T/baremetal_clang_rt_arch \ // RUN: | FileCheck --check-prefix=CHECK-CLANGRT-ARCH %s -// CHECK-CLANGRT-ARCH: "-lclang_rt.builtins-armv6m" -// CHECK-CLANGRT-ARCH-NOT: "-lclang_rt.builtins" +// CHECK-CLANGRT-ARCH: "{{[^"]*}}libclang_rt.builtins-armv6m.a" +// CHECK-CLANGRT-ARCH-NOT: "{{[^"]*}}libclang_rt.builtins.a" // Check that "--no-relax" is forwarded to the linker for RISC-V. // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf -nostdinc -mno-relax \ @@ -471,4 +447,4 @@ // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf -nostdinc \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ // RUN: | FileCheck --check-prefix=CHECK-RV64-RELAX %s -// CHECK-RV64-RELAX-NOT: "--no-relax" \ No newline at end of file +// CHECK-RV64-RELAX-NOT: "--no-relax" From 5248a9872454065b5e4d44ca2f29329df7c2d28f Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Tue, 20 Feb 2024 19:06:21 -0600 Subject: [PATCH 034/351] [mlir][sparse] support SoA COO in codegen path. (#82439) *NOTE*: the `SoA` property only makes a difference on codegen path, and is ignored in libgen path at the moment (only SoA COO is supported). 
--- .../mlir/Dialect/SparseTensor/IR/Enums.h | 8 ++++- .../SparseTensor/IR/SparseTensorType.h | 5 ++- .../SparseTensor/IR/SparseTensorDialect.cpp | 14 ++++---- .../Transforms/SparseTensorCodegen.cpp | 6 ++-- .../Transforms/SparseTensorRewriting.cpp | 2 +- .../Transforms/Utils/CodegenUtils.cpp | 2 +- .../Utils/SparseTensorDescriptor.cpp | 2 +- .../Transforms/Utils/SparseTensorDescriptor.h | 2 +- .../SparseTensor/CPU/sparse_coo_test.mlir | 32 +++++++++++-------- 9 files changed, 43 insertions(+), 30 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h index 41a14575ed105..a00c9c31256c9 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h @@ -283,7 +283,13 @@ struct LevelType { } bool operator!=(const LevelType lhs) const { return !(*this == lhs); } - LevelType stripProperties() const { return LevelType(lvlBits & ~0xffff); } + LevelType stripStorageIrrelevantProperties() const { + // Properties other than `SoA` do not change the storage scheme of the + // sparse tensor. + constexpr uint64_t mask = + 0xffff & ~static_cast(LevelPropNonDefault::SoA); + return LevelType(lvlBits & ~mask); + } /// Get N of NOutOfM level type. 
constexpr uint64_t getN() const { diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h index 24a5640d820e4..1a090ddb782fd 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h @@ -24,6 +24,7 @@ struct COOSegment { std::pair lvlRange; // [low, high) bool isSoA; + bool isAoS() const { return !isSoA; } bool isSegmentStart(Level l) const { return l == lvlRange.first; } bool inSegment(Level l) const { return l >= lvlRange.first && l < lvlRange.second; @@ -337,7 +338,9 @@ class SparseTensorType { /// Returns the starting level of this sparse tensor type for a /// trailing COO region that spans **at least** two levels. If /// no such COO region is found, then returns the level-rank. - Level getCOOStart() const; + /// + /// DEPRECATED: use getCOOSegments instead. + Level getAoSCOOStart() const; /// Returns [un]ordered COO type for this sparse tensor type. 
RankedTensorType getCOOType(bool ordered) const; diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 53e78d2c28b1d..af7b85d458774 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -182,7 +182,7 @@ StorageLayout::getFieldIndexAndStride(SparseTensorFieldKind kind, unsigned stride = 1; if (kind == SparseTensorFieldKind::CrdMemRef) { assert(lvl.has_value()); - const Level cooStart = SparseTensorType(enc).getCOOStart(); + const Level cooStart = SparseTensorType(enc).getAoSCOOStart(); const Level lvlRank = enc.getLvlRank(); if (lvl.value() >= cooStart && lvl.value() < lvlRank) { lvl = cooStart; @@ -811,10 +811,10 @@ bool mlir::sparse_tensor::SparseTensorType::isCOOType(Level startLvl, return !isUnique || isUniqueLvl(lvlRank - 1); } -Level mlir::sparse_tensor::SparseTensorType::getCOOStart() const { +Level mlir::sparse_tensor::SparseTensorType::getAoSCOOStart() const { SmallVector coo = getCOOSegments(); - if (!coo.empty()) { - assert(coo.size() == 1); + assert(coo.size() == 1 || coo.empty()); + if (!coo.empty() && coo.front().isAoS()) { return coo.front().lvlRange.first; } return lvlRank; @@ -1051,7 +1051,7 @@ static SparseTensorEncodingAttr getNormalizedEncodingForSpecifier(SparseTensorEncodingAttr enc) { SmallVector lts; for (auto lt : enc.getLvlTypes()) - lts.push_back(lt.stripProperties()); + lts.push_back(lt.stripStorageIrrelevantProperties()); return SparseTensorEncodingAttr::get( enc.getContext(), lts, @@ -1137,7 +1137,7 @@ static LogicalResult verifyPackUnPack(Operation *op, bool requiresStaticShape, return op->emitError("the sparse-tensor must have an encoding attribute"); // Verifies the trailing COO. - Level cooStartLvl = stt.getCOOStart(); + Level cooStartLvl = stt.getAoSCOOStart(); if (cooStartLvl < stt.getLvlRank()) { // We only supports trailing COO for now, must be the last input. 
auto cooTp = llvm::cast(lvlTps.back()); @@ -1452,7 +1452,7 @@ LogicalResult ToCoordinatesOp::verify() { LogicalResult ToCoordinatesBufferOp::verify() { auto stt = getSparseTensorType(getTensor()); - if (stt.getCOOStart() >= stt.getLvlRank()) + if (stt.getAoSCOOStart() >= stt.getLvlRank()) return emitError("expected sparse tensor with a COO region"); return success(); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp index d4459c6ea1e52..0ccb11f3a6b85 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp @@ -194,7 +194,7 @@ static void createAllocFields(OpBuilder &builder, Location loc, valHeuristic = builder.create(loc, valHeuristic, lvlSizesValues[lvl]); } else if (sizeHint) { - if (stt.getCOOStart() == 0) { + if (stt.getAoSCOOStart() == 0) { posHeuristic = constantIndex(builder, loc, 2); crdHeuristic = builder.create( loc, constantIndex(builder, loc, lvlRank), sizeHint); // AOS @@ -1316,7 +1316,7 @@ struct SparseAssembleOpConverter : public OpConversionPattern { Value posBack = c0; // index to the last value in the position array Value memSize = c1; // memory size for current array - Level trailCOOStart = stt.getCOOStart(); + Level trailCOOStart = stt.getAoSCOOStart(); Level trailCOORank = stt.getLvlRank() - trailCOOStart; // Sets up SparseTensorSpecifier. for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) { @@ -1453,7 +1453,7 @@ struct SparseNewConverter : public OpConversionPattern { const auto dstTp = getSparseTensorType(op.getResult()); // Creating COO with NewOp is handled by direct IR codegen. All other cases // are handled by rewriting. 
- if (!dstTp.hasEncoding() || dstTp.getCOOStart() != 0) + if (!dstTp.hasEncoding() || dstTp.getAoSCOOStart() != 0) return failure(); // Implement as follows: diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp index 7326a6a381128..2ccb2361b5efe 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -1180,7 +1180,7 @@ struct NewRewriter : public OpRewritePattern { PatternRewriter &rewriter) const override { Location loc = op.getLoc(); auto stt = getSparseTensorType(op.getResult()); - if (!stt.hasEncoding() || stt.getCOOStart() == 0) + if (!stt.hasEncoding() || stt.getAoSCOOStart() == 0) return failure(); // Implement the NewOp as follows: diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp index 75a4389149187..b888dfadb9c71 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp @@ -568,7 +568,7 @@ Value sparse_tensor::genToCoordinates(OpBuilder &builder, Location loc, const auto srcTp = getSparseTensorType(tensor); const Type crdTp = srcTp.getCrdType(); const Type memTp = - get1DMemRefType(crdTp, /*withLayout=*/lvl >= srcTp.getCOOStart()); + get1DMemRefType(crdTp, /*withLayout=*/lvl >= srcTp.getAoSCOOStart()); return builder.create(loc, memTp, tensor, builder.getIndexAttr(lvl)); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp index 3ab4157475cd4..6ac26ad550f9f 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.cpp @@ -103,7 +103,7 @@ void 
SparseTensorSpecifier::setSpecifierField(OpBuilder &builder, Location loc, Value sparse_tensor::SparseTensorDescriptor::getCrdMemRefOrView( OpBuilder &builder, Location loc, Level lvl) const { - const Level cooStart = rType.getCOOStart(); + const Level cooStart = rType.getAoSCOOStart(); if (lvl < cooStart) return getMemRefField(SparseTensorFieldKind::CrdMemRef, lvl); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h index 3a61ec7a2236f..c2f631605bf4b 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorDescriptor.h @@ -137,7 +137,7 @@ class SparseTensorDescriptorImpl { } Value getAOSMemRef() const { - const Level cooStart = rType.getCOOStart(); + const Level cooStart = rType.getAoSCOOStart(); assert(cooStart < rType.getLvlRank()); return getMemRefField(SparseTensorFieldKind::CrdMemRef, cooStart); } diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir index aaf15ecc681fc..16252c1005ebb 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_coo_test.mlir @@ -34,6 +34,10 @@ map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton) }> +#SortedCOOSoA = #sparse_tensor.encoding<{ + map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton(soa)) +}> + #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }> @@ -50,7 +54,7 @@ module { func.func @add_coo_csr(%arga: tensor<8x8xf32, #CSR>, - %argb: tensor<8x8xf32, #SortedCOO>) + %argb: tensor<8x8xf32, #SortedCOOSoA>) -> tensor<8x8xf32> { %empty = tensor.empty() : tensor<8x8xf32> %zero = arith.constant 0.000000e+00 : f32 @@ -59,7 +63,7 @@ module { outs(%empty : tensor<8x8xf32>) -> tensor<8x8xf32> %0 = 
linalg.generic #trait ins(%arga, %argb: tensor<8x8xf32, #CSR>, - tensor<8x8xf32, #SortedCOO>) + tensor<8x8xf32, #SortedCOOSoA>) outs(%init: tensor<8x8xf32>) { ^bb(%a: f32, %b: f32, %x: f32): %0 = arith.addf %a, %b : f32 @@ -69,7 +73,7 @@ module { } func.func @add_coo_coo(%arga: tensor<8x8xf32, #SortedCOO>, - %argb: tensor<8x8xf32, #SortedCOO>) + %argb: tensor<8x8xf32, #SortedCOOSoA>) -> tensor<8x8xf32> { %empty = tensor.empty() : tensor<8x8xf32> %zero = arith.constant 0.000000e+00 : f32 @@ -78,7 +82,7 @@ module { outs(%empty : tensor<8x8xf32>) -> tensor<8x8xf32> %0 = linalg.generic #trait ins(%arga, %argb: tensor<8x8xf32, #SortedCOO>, - tensor<8x8xf32, #SortedCOO>) + tensor<8x8xf32, #SortedCOOSoA>) outs(%init: tensor<8x8xf32>) { ^bb(%a: f32, %b: f32, %x: f32): %0 = arith.addf %a, %b : f32 @@ -88,12 +92,12 @@ module { } func.func @add_coo_coo_out_coo(%arga: tensor<8x8xf32, #SortedCOO>, - %argb: tensor<8x8xf32, #SortedCOO>) + %argb: tensor<8x8xf32, #SortedCOOSoA>) -> tensor<8x8xf32, #SortedCOO> { %init = tensor.empty() : tensor<8x8xf32, #SortedCOO> %0 = linalg.generic #trait ins(%arga, %argb: tensor<8x8xf32, #SortedCOO>, - tensor<8x8xf32, #SortedCOO>) + tensor<8x8xf32, #SortedCOOSoA>) outs(%init: tensor<8x8xf32, #SortedCOO>) { ^bb(%a: f32, %b: f32, %x: f32): %0 = arith.addf %a, %b : f32 @@ -104,7 +108,7 @@ module { func.func @add_coo_dense(%arga: tensor<8x8xf32>, - %argb: tensor<8x8xf32, #SortedCOO>) + %argb: tensor<8x8xf32, #SortedCOOSoA>) -> tensor<8x8xf32> { %empty = tensor.empty() : tensor<8x8xf32> %zero = arith.constant 0.000000e+00 : f32 @@ -113,7 +117,7 @@ module { outs(%empty : tensor<8x8xf32>) -> tensor<8x8xf32> %0 = linalg.generic #trait ins(%arga, %argb: tensor<8x8xf32>, - tensor<8x8xf32, #SortedCOO>) + tensor<8x8xf32, #SortedCOOSoA>) outs(%init: tensor<8x8xf32>) { ^bb(%a: f32, %b: f32, %x: f32): %0 = arith.addf %a, %b : f32 @@ -154,19 +158,19 @@ module { %COO_A = sparse_tensor.convert %A : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOO> %COO_B = 
sparse_tensor.convert %B - : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOO> + : tensor<8x8xf32> to tensor<8x8xf32, #SortedCOOSoA> %C1 = call @add_coo_dense(%A, %COO_B) : (tensor<8x8xf32>, - tensor<8x8xf32, #SortedCOO>) + tensor<8x8xf32, #SortedCOOSoA>) -> tensor<8x8xf32> %C2 = call @add_coo_csr(%CSR_A, %COO_B) : (tensor<8x8xf32, #CSR>, - tensor<8x8xf32, #SortedCOO>) + tensor<8x8xf32, #SortedCOOSoA>) -> tensor<8x8xf32> %C3 = call @add_coo_coo(%COO_A, %COO_B) : (tensor<8x8xf32, #SortedCOO>, - tensor<8x8xf32, #SortedCOO>) + tensor<8x8xf32, #SortedCOOSoA>) -> tensor<8x8xf32> %COO_RET = call @add_coo_coo_out_coo(%COO_A, %COO_B) : (tensor<8x8xf32, #SortedCOO>, - tensor<8x8xf32, #SortedCOO>) + tensor<8x8xf32, #SortedCOOSoA>) -> tensor<8x8xf32, #SortedCOO> %C4 = sparse_tensor.convert %COO_RET : tensor<8x8xf32, #SortedCOO> to tensor<8x8xf32> // @@ -204,7 +208,7 @@ module { bufferization.dealloc_tensor %C4 : tensor<8x8xf32> bufferization.dealloc_tensor %CSR_A : tensor<8x8xf32, #CSR> bufferization.dealloc_tensor %COO_A : tensor<8x8xf32, #SortedCOO> - bufferization.dealloc_tensor %COO_B : tensor<8x8xf32, #SortedCOO> + bufferization.dealloc_tensor %COO_B : tensor<8x8xf32, #SortedCOOSoA> bufferization.dealloc_tensor %COO_RET : tensor<8x8xf32, #SortedCOO> From 7c071c23ffe934d863f3a1863d77b41f7e4d2b51 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 20 Feb 2024 20:09:21 -0500 Subject: [PATCH 035/351] [gn] port ff4d6c64ee42 --- llvm/utils/gn/secondary/llvm/test/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn index ab4fd8e6403e7..3257f4b5ff236 100644 --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -74,6 +74,7 @@ write_lit_config("lit_site_cfg") { "LLVM_USE_INTEL_JITEVENTS=0", "LLVM_USE_SANITIZER=", "LLVM_VERSION_MAJOR=$llvm_version_major", + "LLVM_VERSION_MINOR=$llvm_version_minor", "LLVM_VERSION_SUFFIX=git", 
"Python3_EXECUTABLE=$python_path", "TARGETS_TO_BUILD=$llvm_targets_to_build_string", From 031f9f331723e6bebc405ffdee4b8a87a5fc0472 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Tue, 20 Feb 2024 17:12:24 -0800 Subject: [PATCH 036/351] [alpha.webkit.UncountedCallArgsChecker] Ignore calls to WTF's container methods (#82156) This PR makes the checker ignore / skip calls to methods of Web Template Platform's container types such as HashMap, HashSet, WeakHashSet, WeakHashMap, Vector, etc... --- .../WebKit/UncountedCallArgsChecker.cpp | 28 ++++ .../WebKit/call-args-wtf-containers.cpp | 146 ++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp index 8d344f9b63961..8b41a949fd673 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp @@ -170,6 +170,9 @@ class UncountedCallArgsChecker if (!Callee) return false; + if (isMethodOnWTFContainerType(Callee)) + return true; + auto overloadedOperatorType = Callee->getOverloadedOperator(); if (overloadedOperatorType == OO_EqualEqual || overloadedOperatorType == OO_ExclaimEqual || @@ -198,6 +201,31 @@ class UncountedCallArgsChecker return false; } + bool isMethodOnWTFContainerType(const FunctionDecl *Decl) const { + if (!isa(Decl)) + return false; + auto *ClassDecl = Decl->getParent(); + if (!ClassDecl || !isa(ClassDecl)) + return false; + + auto *NsDecl = ClassDecl->getParent(); + if (!NsDecl || !isa(NsDecl)) + return false; + + auto MethodName = safeGetName(Decl); + auto ClsNameStr = safeGetName(ClassDecl); + StringRef ClsName = ClsNameStr; // FIXME: Make safeGetName return StringRef. + auto NamespaceName = safeGetName(NsDecl); + // FIXME: These should be implemented via attributes. 
+ return NamespaceName == "WTF" && + (MethodName == "find" || MethodName == "findIf" || + MethodName == "reverseFind" || MethodName == "reverseFindIf" || + MethodName == "get" || MethodName == "inlineGet" || + MethodName == "contains" || MethodName == "containsIf") && + (ClsName.ends_with("Vector") || ClsName.ends_with("Set") || + ClsName.ends_with("Map")); + } + void reportBug(const Expr *CallArg, const ParmVarDecl *Param) const { assert(CallArg); diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp new file mode 100644 index 0000000000000..0a63a78985612 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/call-args-wtf-containers.cpp @@ -0,0 +1,146 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s + +#include "mock-types.h" + +namespace WTF { + + template + class HashSet { + public: + template T* find(U&) const; + template bool contains(U&) const; + unsigned size() { return m_size; } + template void add(U&) const; + template void remove(U&) const; + + private: + T* m_table { nullptr }; + unsigned m_size { 0 }; + }; + + template + class HashMap { + public: + struct Item { + T key; + S value; + }; + + template Item* find(U&) const; + template bool contains(U&) const; + template S* get(U&) const; + template S* inlineGet(U&) const; + template void add(U&) const; + template void remove(U&) const; + + private: + Item* m_table { nullptr }; + }; + + template + class WeakHashSet { + public: + template T* find(U&) const; + template bool contains(U&) const; + template void add(U&) const; + template void remove(U&) const; + }; + + template + class Vector { + public: + unsigned size() { return m_size; } + T& at(unsigned i) { return m_buffer[i]; } + T& operator[](unsigned i) { return m_buffer[i]; } + template unsigned find(U&); + template unsigned reverseFind(U&); + template bool contains(U&); + template unsigned findIf(const 
MatchFunction& match) + { + for (unsigned i = 0; i < m_size; ++i) { + if (match(at(i))) + return i; + } + return static_cast(-1); + } + template unsigned reverseFindIf(const MatchFunction& match) + { + for (unsigned i = 0; i < m_size; ++i) { + if (match(at(m_size - i))) + return i; + } + return static_cast(-1); + } + template bool containsIf(const MatchFunction& match) + { + for (unsigned i = 0; i < m_size; ++i) { + if (match(at(m_size - i))) + return true; + } + return false; + } + template void append(U&) const; + template void remove(U&) const; + + private: + T* m_buffer { nullptr }; + unsigned m_size { 0 }; + }; + +} + +using WTF::HashSet; +using WTF::HashMap; +using WTF::WeakHashSet; +using WTF::Vector; + +class RefCounted { +public: + void ref() const; + void deref() const; +}; + +RefCounted* object(); + +void test() { + HashSet> set; + set.find(*object()); + set.contains(*object()); + set.add(*object()); + // expected-warning@-1{{Call argument is uncounted and unsafe}} + set.remove(*object()); + // expected-warning@-1{{Call argument is uncounted and unsafe}} + + HashMap, unsigned> map; + map.find(*object()); + map.contains(*object()); + map.inlineGet(*object()); + map.add(*object()); + // expected-warning@-1{{Call argument is uncounted and unsafe}} + map.remove(*object()); + // expected-warning@-1{{Call argument is uncounted and unsafe}} + + WeakHashSet> weakSet; + weakSet.find(*object()); + weakSet.contains(*object()); + weakSet.add(*object()); + // expected-warning@-1{{Call argument is uncounted and unsafe}} + weakSet.remove(*object()); + // expected-warning@-1{{Call argument is uncounted and unsafe}} + + Vector> vector; + vector.at(0); + vector[0]; + vector.find(*object()); + vector.reverseFind(*object()); + vector.contains(*object()); + vector.append(*object()); + // expected-warning@-1{{Call argument is uncounted and unsafe}} + vector.remove(*object()); + // expected-warning@-1{{Call argument is uncounted and unsafe}} + + auto* obj = object(); + 
vector.findIf([&](Ref key) { return key.ptr() == obj; }); + vector.reverseFindIf([&](Ref key) { return key.ptr() == obj; }); + vector.containsIf([&](Ref key) { return key.ptr() == obj; }); +} \ No newline at end of file From 84ed55e11f8d8f434395f869a1caa8485dd0c187 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 20 Feb 2024 20:24:32 -0500 Subject: [PATCH 037/351] Revert "[clang][ScanDeps] Canonicalize -D and -U flags (#82298)" This reverts commit 3ff805540173b83d73b673b39ac5760fc19bac15. Test is failing on bots, see https://github.com/llvm/llvm-project/pull/82298#issuecomment-1955664462 --- .../DependencyScanningService.h | 5 +- .../DependencyScanningWorker.cpp | 74 ---------------- .../optimize-canonicalize-macros.m | 87 ------------------- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 1 - 4 files changed, 1 insertion(+), 166 deletions(-) delete mode 100644 clang/test/ClangScanDeps/optimize-canonicalize-macros.m diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h index 557f0e547ab4a..4f9867262a275 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h @@ -60,10 +60,7 @@ enum class ScanningOptimizations { /// Remove unused -ivfsoverlay arguments. VFS = 4, - /// Canonicalize -D and -U options. 
- Macros = 8, - - DSS_LAST_BITMASK_ENUM(Macros), + DSS_LAST_BITMASK_ENUM(VFS), Default = All }; diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 7477b930188b4..3cf3ad8a4e490 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -179,78 +179,6 @@ static void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) { DiagOpts.IgnoreWarnings = true; } -// Clang implements -D and -U by splatting text into a predefines buffer. This -// allows constructs such as `-DFඞ=3 "-D F\u{0D9E} 4 3 2”` to be accepted and -// define the same macro, or adding C++ style comments before the macro name. -// -// This function checks that the first non-space characters in the macro -// obviously form an identifier that can be uniqued on without lexing. Failing -// to do this could lead to changing the final definition of a macro. -// -// We could set up a preprocessor and actually lex the name, but that's very -// heavyweight for a situation that will almost never happen in practice. 
-static std::optional getSimpleMacroName(StringRef Macro) { - StringRef Name = Macro.split("=").first.ltrim(" \t"); - std::size_t I = 0; - - auto FinishName = [&]() -> std::optional { - StringRef SimpleName = Name.slice(0, I); - if (SimpleName.empty()) - return std::nullopt; - return SimpleName; - }; - - for (; I != Name.size(); ++I) { - switch (Name[I]) { - case '(': // Start of macro parameter list - case ' ': // End of macro name - case '\t': - return FinishName(); - case '_': - continue; - default: - if (llvm::isAlnum(Name[I])) - continue; - return std::nullopt; - } - } - return FinishName(); -} - -static void canonicalizeDefines(PreprocessorOptions &PPOpts) { - using MacroOpt = std::pair; - std::vector SimpleNames; - SimpleNames.reserve(PPOpts.Macros.size()); - std::size_t Index = 0; - for (const auto &M : PPOpts.Macros) { - auto SName = getSimpleMacroName(M.first); - // Skip optimizing if we can't guarantee we can preserve relative order. - if (!SName) - return; - SimpleNames.emplace_back(*SName, Index); - ++Index; - } - - llvm::stable_sort(SimpleNames, [](const MacroOpt &A, const MacroOpt &B) { - return A.first < B.first; - }); - // Keep the last instance of each macro name by going in reverse - auto NewEnd = std::unique( - SimpleNames.rbegin(), SimpleNames.rend(), - [](const MacroOpt &A, const MacroOpt &B) { return A.first == B.first; }); - SimpleNames.erase(SimpleNames.begin(), NewEnd.base()); - - // Apply permutation. - decltype(PPOpts.Macros) NewMacros; - NewMacros.reserve(SimpleNames.size()); - for (std::size_t I = 0, E = SimpleNames.size(); I != E; ++I) { - std::size_t OriginalIndex = SimpleNames[I].second; - // We still emit undefines here as they may be undefining a predefined macro - NewMacros.push_back(std::move(PPOpts.Macros[OriginalIndex])); - } - std::swap(PPOpts.Macros, NewMacros); -} - /// A clang tool that runs the preprocessor in a mode that's optimized for /// dependency scanning for the given compiler invocation. 
class DependencyScanningAction : public tooling::ToolAction { @@ -275,8 +203,6 @@ class DependencyScanningAction : public tooling::ToolAction { CompilerInvocation OriginalInvocation(*Invocation); // Restore the value of DisableFree, which may be modified by Tooling. OriginalInvocation.getFrontendOpts().DisableFree = DisableFree; - if (any(OptimizeArgs & ScanningOptimizations::Macros)) - canonicalizeDefines(OriginalInvocation.getPreprocessorOpts()); if (Scanned) { // Scanning runs once for the first -cc1 invocation in a chain of driver diff --git a/clang/test/ClangScanDeps/optimize-canonicalize-macros.m b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m deleted file mode 100644 index 2c9b06be39210..0000000000000 --- a/clang/test/ClangScanDeps/optimize-canonicalize-macros.m +++ /dev/null @@ -1,87 +0,0 @@ -// This test verifies that command lines with equivalent -D and -U arguments -// are canonicalized to the same module variant. - -// RUN: rm -rf %t -// RUN: split-file %s %t -// RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands.json.in > %t/build/compile-commands.json -// RUN: clang-scan-deps -compilation-database %t/build/compile-commands.json \ -// RUN: -j 1 -format experimental-full -optimize-args=canonicalize-macros > %t/deps.db -// RUN: cat %t/deps.db | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t - -// Verify that there are only two variants and that the expected merges have -// happened. 
- -// CHECK: { -// CHECK-NEXT: "modules": [ -// CHECK-NEXT: { -// CHECK-NEXT: "clang-module-deps": [], -// CHECK-NEXT: "clang-modulemap-file": -// CHECK-NEXT: "command-line": [ -// CHECK-NOT: "J=1" -// CHECK-NOT: "J" -// CHECK-NOT: "K" -// CHECK: ], -// CHECK-NEXT: "context-hash": "{{.*}}", -// CHECK-NEXT: "file-deps": [ -// CHECK: ], -// CHECK-NEXT: "name": "A" -// CHECK-NEXT: }, -// CHECK-NEXT: { -// CHECK-NEXT: "clang-module-deps": [], -// CHECK-NEXT: "clang-modulemap-file": -// CHECK-NEXT: "command-line": [ -// CHECK: "Fඞ" -// CHECK: "F\\u{0D9E}" -// CHECK: "K" -// CHECK: "K" -// CHECK: ], -// CHECK-NEXT: "context-hash": "{{.*}}", -// CHECK-NEXT: "file-deps": [ -// CHECK: ], -// CHECK-NEXT: "name": "A" -// CHECK-NEXT: } -// CHECK-NEXT: ], -// CHECK-NEXT: "translation-units": [ -// CHECK: ] -// CHECK: } - - -//--- build/compile-commands.json.in - -[ -{ - "directory": "DIR", - "command": "clang -c DIR/tu0.m -DJ=1 -UJ -DJ=2 -DI -DK(x)=x -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", - "file": "DIR/tu0.m" -}, -{ - "directory": "DIR", - "command": "clang -c DIR/tu1.m -DK -DK(x)=x -DI -D \"J=2\" -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", - "file": "DIR/tu1.m" -}, -{ - "directory": "DIR", - "command": "clang -c DIR/tu2.m -I modules/A -DFඞ '-DF\\u{0D9E}' -DK -DK -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", - "file": "DIR/tu2.m" -} -] - -//--- modules/A/module.modulemap - -module A { - umbrella header "A.h" -} - -//--- modules/A/A.h - -//--- tu0.m - -#include - -//--- tu1.m - -#include - -//--- tu2.m - -#include diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 9811d2a875335..0458a4b3ecec3 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -157,7 +157,6 @@ static void ParseArgs(int argc, char **argv) { .Case("header-search", 
ScanningOptimizations::HeaderSearch) .Case("system-warnings", ScanningOptimizations::SystemWarnings) .Case("vfs", ScanningOptimizations::VFS) - .Case("canonicalize-macros", ScanningOptimizations::Macros) .Case("all", ScanningOptimizations::All) .Default(std::nullopt); if (!Optimization) { From 8603a7b21f301508d3a6af9f2238c7b92ce19617 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 20 Feb 2024 17:18:03 -0800 Subject: [PATCH 038/351] [RISCV] Add a query for exact VLEN to RISCVSubtarget [nfc] We've now got enough of these in tree that we can see which patterns appear to be idiomatic. As such, extract a helper for checking if we know the exact VLEN. --- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 4 ++-- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 5 ++--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +++---- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 8 ++++---- llvm/lib/Target/RISCV/RISCVSubtarget.h | 8 ++++++++ 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 7e3dcb3283cab..8bac41372b5a8 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -399,9 +399,9 @@ void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF, // Optimize compile time offset case StackOffset Offset = StackOffset::getScalable(Amount); - if (STI.getRealMinVLen() == STI.getRealMaxVLen()) { + if (auto VLEN = STI.getRealVLen()) { // 1. 
Multiply the number of v-slots by the (constant) length of register - const int64_t VLENB = STI.getRealMinVLen() / 8; + const int64_t VLENB = *VLEN / 8; assert(Amount % 8 == 0 && "Reserve the stack by the multiple of one vector size."); const int64_t NumOfVReg = Amount / 8; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 80797e36ad40f..904f1d7fdf906 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -577,9 +577,8 @@ void RISCVDAGToDAGISel::selectVSETVLI(SDNode *Node) { SDValue VLOperand; unsigned Opcode = RISCV::PseudoVSETVLI; if (auto *C = dyn_cast(Node->getOperand(1))) { - const unsigned VLEN = Subtarget->getRealMinVLen(); - if (VLEN == Subtarget->getRealMaxVLen()) - if (VLEN / RISCVVType::getSEWLMULRatio(SEW, VLMul) == C->getZExtValue()) + if (auto VLEN = Subtarget->getRealVLen()) + if (*VLEN / RISCVVType::getSEWLMULRatio(SEW, VLMul) == C->getZExtValue()) VLMax = true; } if (VLMax || isAllOnesConstant(Node->getOperand(1))) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9ab6895aed521..874c851cd9147 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -8092,12 +8092,11 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, // If we're compiling for an exact VLEN value, we can always perform // the insert in m1 as we can determine the register corresponding to // the index in the register group. 
- const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); const MVT M1VT = getLMUL1VT(ContainerVT); - if (MinVLen == MaxVLen && ContainerVT.bitsGT(M1VT)) { + if (auto VLEN = Subtarget.getRealVLen(); + VLEN && ContainerVT.bitsGT(M1VT)) { EVT ElemVT = VecVT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLEN / ElemVT.getFixedSizeInBits(); unsigned RemIdx = OrigIdx % ElemsPerVReg; unsigned SubRegIdx = OrigIdx / ElemsPerVReg; unsigned ExtractIdx = diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index ca519dbc4c035..9d1f01dffaaf4 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -283,8 +283,8 @@ void RISCVRegisterInfo::lowerVSPILL(MachineBasicBlock::iterator II) const { Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass); // Optimize for constant VLEN. - if (STI.getRealMinVLen() == STI.getRealMaxVLen()) { - const int64_t VLENB = STI.getRealMinVLen() / 8; + if (auto VLEN = STI.getRealVLen()) { + const int64_t VLENB = *VLEN / 8; int64_t Offset = VLENB * LMUL; STI.getInstrInfo()->movImm(MBB, II, DL, VL, Offset); } else { @@ -360,8 +360,8 @@ void RISCVRegisterInfo::lowerVRELOAD(MachineBasicBlock::iterator II) const { Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass); // Optimize for constant VLEN. 
- if (STI.getRealMinVLen() == STI.getRealMaxVLen()) { - const int64_t VLENB = STI.getRealMinVLen() / 8; + if (auto VLEN = STI.getRealVLen()) { + const int64_t VLENB = *VLEN / 8; int64_t Offset = VLENB * LMUL; STI.getInstrInfo()->movImm(MBB, II, DL, VL, Offset); } else { diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 8c55efa69a6a5..4b60d7aff22a0 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -188,6 +188,14 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { unsigned VLen = getMaxRVVVectorSizeInBits(); return VLen == 0 ? 65536 : VLen; } + // If we know the exact VLEN, return it. Otherwise, return std::nullopt. + std::optional getRealVLen() const { + unsigned Min = getRealMinVLen(); + if (Min != getRealMaxVLen()) + return std::nullopt; + return Min; + } + RISCVABI::ABI getTargetABI() const { return TargetABI; } bool isSoftFPABI() const { return TargetABI == RISCVABI::ABI_LP64 || From b9a071dc3995c1599724447b9db8ced449318839 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Tue, 20 Feb 2024 17:39:58 -0800 Subject: [PATCH 039/351] [mlir][Linalg] Add folders for `linalg.transpose` (#81709) This PR adds folders for linalg transpose ops with only one dimension or an identity permutation. The folding removes the `linalg.transpose` and just propagates the input tensor. 
--- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 3 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 16 +++++++++ mlir/test/Dialect/Linalg/canonicalize.mlir | 35 +++++++++++++++++++ .../Linalg/generalize-tensor-pack-tile.mlir | 16 +++------ .../Linalg/generalize-tensor-pack.mlir | 16 +++------ .../Linalg/generalize-tensor-unpack-tile.mlir | 16 +++------ .../Linalg/generalize-tensor-unpack.mlir | 17 +++------ 7 files changed, 69 insertions(+), 50 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 272bc3116c5fd..92d844eefb720 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -245,7 +245,7 @@ def MapOp : LinalgStructuredBase_Op<"map", [ } ``` - Shortened print form is available. Applies to simple maps with one + Shortened print form is available. Applies to simple maps with one non-yield operation inside the body. The example above will be printed as: @@ -458,6 +458,7 @@ def TransposeOp : LinalgStructuredBase_Op<"transpose", [ ::mlir::OperationState & odsState); }]; + let hasFolder = 1; let hasCustomAssemblyFormat = 1; let hasVerifier = 1; } diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index a0f02f6a7f259..919f5130e1760 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -1786,6 +1786,22 @@ void TransposeOp::getEffects( getDpsInits()); } +LogicalResult TransposeOp::fold(FoldAdaptor adaptor, + SmallVectorImpl &result) { + // Single dimension transpose. + if (getPermutation().size() == 0) { + result.push_back(getInput()); + return success(); + } + // Identity permutation. 
+ if (isIdentityPermutation(getPermutation())) { + result.push_back(getInput()); + return success(); + } + + return failure(); +} + //===----------------------------------------------------------------------===// // BroadcastOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index 721f35162ef86..7adde3117deea 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -1029,3 +1029,38 @@ func.func @broadcast_same_shape(%input: tensor<2x3xf32>, %init: tensor<2x3xf32>) %0 = linalg.broadcast ins(%input: tensor<2x3xf32>) outs(%init: tensor<2x3xf32>) dimensions = [] return %0 : tensor<2x3xf32> } + +// ---- + +func.func @transpose_1d(%input: tensor<16xf32>, + %init: tensor<16xf32>) -> tensor<16xf32> { + %transpose = linalg.transpose + ins(%input:tensor<16xf32>) + outs(%init:tensor<16xf32>) + permutation = [0] + func.return %transpose : tensor<16xf32> +} + +// CHECK-LABEL: func @transpose_1d( +// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]+]]: tensor<16xf32>, +// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<16xf32>) +// CHECK-NOT: linalg.transpose +// CHECK: return %[[INPUT]] : tensor<16xf32> + +// ----- + +func.func @transpose_identity_perm(%input: tensor<16x32x64xf32>, + %init: tensor<16x32x64xf32>) -> tensor<16x32x64xf32> { + %transpose = linalg.transpose + ins(%input:tensor<16x32x64xf32>) + outs(%init:tensor<16x32x64xf32>) + permutation = [0, 1, 2] + func.return %transpose : tensor<16x32x64xf32> +} + +// CHECK-LABEL: func @transpose_identity_perm( +// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]+]]: tensor<16x32x64xf32>, +// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<16x32x64xf32>) +// CHECK-NOT: linalg.transpose +// CHECK: return %[[INPUT]] : tensor<16x32x64xf32> + diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir index 
d63433248ab1e..0a197a0ee9fa6 100644 --- a/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir +++ b/mlir/test/Dialect/Linalg/generalize-tensor-pack-tile.mlir @@ -48,12 +48,8 @@ func.func @pad_and_pack(%arg0: tensor<13x15xf32>, %arg1: tensor<2x8x8x2xf32>, %a // CHECK: %[[PAD:.+]] = tensor.pad %[[SRC_SLICE]] // CHECK: tensor.yield %[[PAD_VAL]] // CHECK: } : tensor to tensor<8x2xf32> -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x2xf32> -// CHECK: %[[TRANSP:.+]] = linalg.transpose -// CHECK-SAME: ins(%[[PAD]] : tensor<8x2xf32>) -// CHECK-SAME: outs(%[[EMPTY]] : tensor<8x2xf32>) -// CHECK-SAME: permutation = [0, 1] -// CHECK: %{{.+}} = tensor.insert_slice %[[TRANSP]] into %{{.+}} +// CHECK-NOT: linalg.transpose +// CHECK: %{{.+}} = tensor.insert_slice %[[PAD]] into %{{.+}} module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { @@ -81,12 +77,8 @@ func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>) // CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP2]](%[[C]]) // CHECK: %[[TILE:.+]] = tensor.extract_slice %[[SRC]] // CHECK-SAME: [%[[IN_K]], %[[IN_C]]] [32, 8] [1, 1] -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x8xf32> -// CHECK: %[[TRANSP:.+]] = linalg.transpose -// CHECK-SAME: ins(%[[TILE]] -// CHECK-SAME: outs(%[[EMPTY]] -// CHECK-SAME: permutation = [0, 1] -// CHECK: %[[SUB_ITER:.+]] = tensor.insert_slice %[[TRANSP]] into %{{[a-zA-Z0-9]+}} +// CHECK-NOT: linalg.transpose +// CHECK: %[[SUB_ITER:.+]] = tensor.insert_slice %[[TILE]] into %{{[a-zA-Z0-9]+}} // CHECK-SAME: [0, 0, 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] : tensor<32x8xf32> into tensor<1x1x32x8xf32> // CHECK: %{{.+}} = tensor.insert_slice %[[SUB_ITER]] into %{{[a-zA-Z0-9]+}} // CHECK-SAME: [%[[C]], %[[K]], 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] : tensor<1x1x32x8xf32> into tensor<32x4x32x8xf32> diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir 
b/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir index eaad6bd827047..7d87a0994004f 100644 --- a/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir +++ b/mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir @@ -29,12 +29,8 @@ func.func @simple_pad_and_pack(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2x // CHECK-SAME: %[[PAD_VAL:[a-zA-Z0-9]+]] // CHECK: %[[PAD:.+]] = tensor.pad %[[SRC]] low[0, 0] high[3, 1] // CHECK: tensor.yield %[[PAD_VAL]] -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x2xf32> -// CHECK: %[[TRANSP:.+]] = linalg.transpose -// CHECK-SAME: ins(%[[PAD]] : tensor<8x2xf32>) -// CHECK-SAME: outs(%[[EMPTY]] : tensor<8x2xf32>) -// CHECK-SAME: permutation = [0, 1] -// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]] +// CHECK-NOT: linalg.transpose +// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[PAD]] into %[[DEST]] // CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 2] [1, 1, 1, 1] // CHECK: return %[[INSERT]] @@ -47,12 +43,8 @@ func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32 // CHECK-LABEL: func.func @simple_NC_to_CNnc // CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x8xf32> -// CHECK: %[[TRANSP:.+]] = linalg.transpose -// CHECK-SAME: ins(%[[SRC]] : tensor<32x8xf32>) -// CHECK-SAME: outs(%[[EMPTY]] : tensor<32x8xf32>) -// CHECK-SAME: permutation = [0, 1] -// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]] +// CHECK-NOT: linalg.transpose +// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[SRC]] into %[[DEST]] // CHECK-SAME: [0, 0, 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] // CHECK: return %[[INSERT]] diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir index f0d4b790520e0..7d64331c98784 100644 --- a/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir +++ b/mlir/test/Dialect/Linalg/generalize-tensor-unpack-tile.mlir @@ 
-57,12 +57,8 @@ func.func @unpack_and_extract_slice(%arg0: tensor<2x8x8x2xf32>, %arg1: tensor<13 // CHECK-SAME: [%[[I]], %[[J]]] [%[[OUT_I_SZ]], %[[OUT_J_SZ]]] // CHECK: %[[TILE:.+]] = tensor.extract_slice %[[SRC_SLICE]] // CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 2] [1, 1, 1, 1] : tensor<1x1x8x2xf32> to tensor<8x2xf32> -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x2xf32> -// CHECK: %[[TRANSP:.+]] = linalg.transpose -// CHECK-SAME: ins(%[[TILE]] : tensor<8x2xf32>) -// CHECK-SAME: outs(%[[EMPTY]] : tensor<8x2xf32>) -// CHECK-SAME: permutation = [0, 1] -// CHECK: %[[UNPACK_TILE:.+]] = tensor.extract_slice %[[TRANSP]] +// CHECK-NOT: linalg.transpose +// CHECK: %[[UNPACK_TILE:.+]] = tensor.extract_slice %[[TILE]] // CHECK-SAME: [0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]]] [1, 1] // CHECK: %[[INSERT1:.+]] = tensor.insert_slice %[[UNPACK_TILE]] into %[[ITER_SLICE]] // CHECK-SAME: [0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]]] [1, 1] @@ -96,12 +92,8 @@ func.func @CKkc_to_KC(%arg0: tensor<32x4x32x8xf32>, %arg1: tensor<128x256xf32>) // CHECK-SAME: [%[[IN_C]], %[[IN_K]], 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] // CHECK: %[[TILE:.+]] = tensor.extract_slice %[[SRC_SLICE]] // CHECK-SAME: [0, 0, 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] : tensor<1x1x32x8xf32> to tensor<32x8xf32> -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x8xf32> -// CHECK: %[[TRANSP:.+]] = linalg.transpose -// CHECK-SAME: ins(%[[TILE]] -// CHECK-SAME: outs(%[[EMPTY]] -// CHECK-SAME: permutation = [0, 1] -// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %{{[a-zA-Z0-9]+}} +// CHECK-NOT: linalg.transpose +// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TILE]] into %{{[a-zA-Z0-9]+}} // CHECK-SAME: [%[[K]], %[[C]]] [32, 8] [1, 1] diff --git a/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir b/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir index 0237680886500..153ce68b8f086 100644 --- a/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir +++ b/mlir/test/Dialect/Linalg/generalize-tensor-unpack.mlir @@ 
-27,14 +27,10 @@ func.func @simple_unpack_and_extract_slice(%input: tensor<1x1x8x2xf32>, %output: // CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[TILE:.+]] = tensor.extract_slice %[[SRC]][0, 0, 0, 0] [1, 1, 8, 2] [1, 1, 1, 1] -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x2xf32> -// CHECK: %[[TRANSP:.+]] = linalg.transpose -// CHECK-SAME: ins(%[[TILE]] : tensor<8x2xf32>) -// CHECK-SAME: outs(%[[EMPTY]] : tensor<8x2xf32>) -// CHECK-SAME: permutation = [0, 1] +// CHECK-NOT: linalg.transpose // They have the same type, so the insert_slice op is folded // away. -// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[TRANSP]][0, 0] [5, 1] [1, 1] +// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[TILE]][0, 0] [5, 1] [1, 1] // CHECK: return %[[SLICE]] // ----- @@ -47,14 +43,10 @@ func.func @simple_CNnc_to_NC(%arg0: tensor<1x1x32x8xf32>, %arg1: tensor<32x8xf32 // CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[TILE:.+]] = tensor.extract_slice %[[SRC]][0, 0, 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x8xf32> -// CHECK: %[[TRANSP:.+]] = linalg.transpose -// CHECK-SAME: ins(%[[TILE]] : tensor<32x8xf32>) -// CHECK-SAME: outs(%[[EMPTY]] : tensor<32x8xf32>) -// CHECK-SAME: permutation = [0, 1] +// CHECK-NOT: linalg.transpose // They have the same type, so the insert_slice op is folded // away. -// CHECK: return %[[TRANSP]] +// CHECK: return %[[TILE]] // ----- @@ -75,7 +67,6 @@ func.func @simple_NCHWc_to_NCHW(%arg0: tensor<2x1x16x8x32xf32>, %arg1: tensor<2x // away. 
// CHECK: return %[[TRANSP]] - // ----- func.func @simple_NHWC_to_NCHW(%arg0: tensor<1x16x8x32xf32>, %arg1: tensor<1x32x16x8xf32>) -> tensor<1x32x16x8xf32> { From 2836d8edbfbcd461b25101ed58f93c862d65903a Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 20 Feb 2024 17:52:38 -0800 Subject: [PATCH 040/351] [workflows] Fix permissions check for creating new releases (#81163) The default GitHub token does not have read permissions on the org, so we need to use a custom token in order to read the members of the llvm-release-managers team. --- .github/workflows/release-tasks.yml | 4 +++- llvm/utils/release/github-upload-release.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml index f2a831ad3577a..53da8662b0203 100644 --- a/.github/workflows/release-tasks.yml +++ b/.github/workflows/release-tasks.yml @@ -28,6 +28,7 @@ jobs: name: Create a New Release runs-on: ubuntu-latest needs: validate-tag + steps: - name: Install Dependencies run: | @@ -40,8 +41,9 @@ jobs: - name: Create Release env: GITHUB_TOKEN: ${{ github.token }} + USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }} run: | - ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} create + ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} --user-token "$USER_TOKEN" create release-documentation: name: Build and Upload Release Documentation needs: diff --git a/llvm/utils/release/github-upload-release.py b/llvm/utils/release/github-upload-release.py index a8bb569d2fc99..14ec05062d88c 100755 --- a/llvm/utils/release/github-upload-release.py +++ b/llvm/utils/release/github-upload-release.py @@ -77,20 +77,28 @@ def upload_files(repo, release, files): parser.add_argument("--token", type=str) 
parser.add_argument("--release", type=str) parser.add_argument("--user", type=str) +parser.add_argument("--user-token", type=str) # Upload args parser.add_argument("--files", nargs="+", type=str) args = parser.parse_args() -github = github.Github(args.token) -llvm_org = github.get_organization("llvm") +gh = github.Github(args.token) +llvm_org = gh.get_organization("llvm") llvm_repo = llvm_org.get_repo("llvm-project") if args.user: + if not args.user_token: + print("--user-token option required when --user is used") + sys.exit(1) # Validate that this user is allowed to modify releases. - user = github.get_user(args.user) - team = llvm_org.get_team_by_slug("llvm-release-managers") + user = gh.get_user(args.user) + team = ( + github.Github(args.user_token) + .get_organization("llvm") + .get_team_by_slug("llvm-release-managers") + ) if not team.has_in_members(user): print("User {} is not a allowed to modify releases".format(args.user)) sys.exit(1) From 5a45d32b5b42dc4ed4852b0045391a1c2be41b48 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Tue, 20 Feb 2024 18:17:01 -0800 Subject: [PATCH 041/351] [lldb] Add more ways to find the .dwp file. (#81067) When using split DWARF we can run into many different ways to store debug info: - lldb loads `` which contains skeleton DWARF and needs to find `.dwp` - lldb loads `` which is stripped but has .gnu_debuglink pointing to `.debug` with skeleton DWARF and needs to find `.dwp` - lldb loads `` which is stripped but has .gnu_debuglink pointing to `.debug` with skeleton DWARF and needs to find `.debug.dwp` - lldb loads `.debug` and needs to find `.dwp` Previously we only handled the first two cases. This patch adds support for the latter two. 
--- lldb/include/lldb/Utility/FileSpecList.h | 4 + .../SymbolFile/DWARF/LogChannelDWARF.cpp | 1 + .../SymbolFile/DWARF/LogChannelDWARF.h | 1 + .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 68 +++++++++++---- .../DWARF/x86/dwp-separate-debug-file.cpp | 86 ++++++++++++++++++- 5 files changed, 141 insertions(+), 19 deletions(-) diff --git a/lldb/include/lldb/Utility/FileSpecList.h b/lldb/include/lldb/Utility/FileSpecList.h index 49edc667ddd5b..6eb3bb9971f13 100644 --- a/lldb/include/lldb/Utility/FileSpecList.h +++ b/lldb/include/lldb/Utility/FileSpecList.h @@ -238,6 +238,10 @@ class FileSpecList { const_iterator begin() const { return m_files.begin(); } const_iterator end() const { return m_files.end(); } + llvm::iterator_range files() const { + return llvm::make_range(begin(), end()); + } + protected: collection m_files; ///< A collection of FileSpec objects. }; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.cpp index 6b063f3bd88d8..795355b57a06d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.cpp @@ -22,6 +22,7 @@ static constexpr Log::Category g_categories[] = { {{"map"}, {"log insertions of object files into DWARF debug maps"}, DWARFLog::DebugMap}, + {{"split"}, {"log split DWARF related activities"}, DWARFLog::SplitDwarf}, }; static Log::Channel g_channel(g_categories, DWARFLog::DebugInfo); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h index 662aa6757e2ff..7f254a1162bd1 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/LogChannelDWARF.h @@ -20,6 +20,7 @@ enum class DWARFLog : Log::MaskType { DebugMap = Log::ChannelFlag<2>, Lookups = Log::ChannelFlag<3>, TypeCompletion = Log::ChannelFlag<4>, + SplitDwarf = Log::ChannelFlag<5>, LLVM_MARK_AS_BITMASK_ENUM(TypeCompletion) }; 
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 42211b9a21b0e..84ff4c2565a05 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -4349,26 +4349,60 @@ SymbolFileDWARFDebugMap *SymbolFileDWARF::GetDebugMapSymfile() { const std::shared_ptr &SymbolFileDWARF::GetDwpSymbolFile() { llvm::call_once(m_dwp_symfile_once_flag, [this]() { + // Create a list of files to try and append .dwp to. + FileSpecList symfiles; + // Append the module's object file path. + const FileSpec module_fspec = m_objfile_sp->GetModule()->GetFileSpec(); + symfiles.Append(module_fspec); + // Append the object file for this SymbolFile only if it is different from + // the module's file path. Our main module could be "a.out", our symbol file + // could be "a.debug" and our ".dwp" file might be "a.debug.dwp" instead of + // "a.out.dwp". + const FileSpec symfile_fspec(m_objfile_sp->GetFileSpec()); + if (symfile_fspec != module_fspec) { + symfiles.Append(symfile_fspec); + } else { + // If we don't have a separate debug info file, then try stripping the + // extension. The main module could be "a.debug" and the .dwp file could + // be "a.dwp" instead of "a.debug.dwp". 
+ ConstString filename_no_ext = + module_fspec.GetFileNameStrippingExtension(); + if (filename_no_ext != module_fspec.GetFilename()) { + FileSpec module_spec_no_ext(module_fspec); + module_spec_no_ext.SetFilename(filename_no_ext); + symfiles.Append(module_spec_no_ext); + } + } + Log *log = GetLog(DWARFLog::SplitDwarf); + FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths(); ModuleSpec module_spec; module_spec.GetFileSpec() = m_objfile_sp->GetFileSpec(); - module_spec.GetSymbolFileSpec() = - FileSpec(m_objfile_sp->GetModule()->GetFileSpec().GetPath() + ".dwp"); - module_spec.GetUUID() = m_objfile_sp->GetUUID(); - FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths(); - FileSpec dwp_filespec = - PluginManager::LocateExecutableSymbolFile(module_spec, search_paths); - if (FileSystem::Instance().Exists(dwp_filespec)) { - DataBufferSP dwp_file_data_sp; - lldb::offset_t dwp_file_data_offset = 0; - ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin( - GetObjectFile()->GetModule(), &dwp_filespec, 0, - FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp, - dwp_file_data_offset); - if (!dwp_obj_file) - return; - m_dwp_symfile = std::make_shared( - *this, dwp_obj_file, DIERef::k_file_index_mask); + for (const auto &symfile : symfiles.files()) { + module_spec.GetSymbolFileSpec() = + FileSpec(symfile.GetPath() + ".dwp", symfile.GetPathStyle()); + LLDB_LOG(log, "Searching for DWP using: \"{0}\"", + module_spec.GetSymbolFileSpec()); + FileSpec dwp_filespec = + PluginManager::LocateExecutableSymbolFile(module_spec, search_paths); + if (FileSystem::Instance().Exists(dwp_filespec)) { + LLDB_LOG(log, "Found DWP file: \"{0}\"", dwp_filespec); + DataBufferSP dwp_file_data_sp; + lldb::offset_t dwp_file_data_offset = 0; + ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin( + GetObjectFile()->GetModule(), &dwp_filespec, 0, + FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp, + dwp_file_data_offset); + if (dwp_obj_file) { + 
m_dwp_symfile = std::make_shared( + *this, dwp_obj_file, DIERef::k_file_index_mask); + break; + } + } + } + if (!m_dwp_symfile) { + LLDB_LOG(log, "Unable to locate for DWP file for: \"{0}\"", + m_objfile_sp->GetModule()->GetFileSpec()); } }); return m_dwp_symfile; diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp index a47209931c384..9a8149065b6e5 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-separate-debug-file.cpp @@ -1,12 +1,16 @@ // REQUIRES: lld +// Now test with DWARF5 // RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.dwarf5.o // RUN: ld.lld %t.dwarf5.o -o %t.dwarf5 // RUN: llvm-dwp %t.dwarf5.dwo -o %t.dwarf5.dwp // RUN: rm %t.dwarf5.dwo // RUN: llvm-objcopy --only-keep-debug %t.dwarf5 %t.dwarf5.debug // RUN: llvm-objcopy --strip-all --add-gnu-debuglink=%t.dwarf5.debug %t.dwarf5 -// RUN: %lldb %t.dwarf5 -o "target variable a" -b | FileCheck %s +// RUN: %lldb \ +// RUN: -O "log enable dwarf split" \ +// RUN: -o "target variable a" \ +// RUN: -b %t.dwarf5 | FileCheck %s // Run one time with the index cache enabled to populate the index cache. 
When // we populate the index cache we have to parse all of the DWARF debug info @@ -34,6 +38,31 @@ // RUN: -o "statistics dump" \ // RUN: %t.dwarf5 -b | FileCheck %s -check-prefix=CACHED +// Make sure that if we load the "%t.dwarf5.debug" file, that we can find and +// load the .dwo file from the .dwp when it is "%t.dwarf5.dwp" +// RUN: %lldb %t.dwarf5.debug -o "b main" -b | FileCheck %s -check-prefix=DEBUG + +// Make sure that if we load the "%t.dwarf5" file, that we can find and +// load the .dwo file from the .dwp when it is "%t.dwarf5.debug.dwp" +// RUN: mv %t.dwarf5.dwp %t.dwarf5.debug.dwp +// RUN: %lldb %t.dwarf5 -o "b main" -b | FileCheck %s -check-prefix=DEBUG + +// Make sure that if we load the "%t.dwarf5.debug" file, that we can find and +// load the .dwo file from the .dwp when it is "%t.dwarf5.debug.dwp" +// RUN: %lldb %t.dwarf5.debug -o "b main" -b | FileCheck %s -check-prefix=DEBUG + +// Make sure that if we remove the .dwp file we see an appropriate error. +// RUN: rm %t.dwarf5.debug.dwp +// RUN: %lldb \ +// RUN: -O "log enable dwarf split" \ +// RUN: -o "b main" \ +// RUN: -b %t.dwarf5 2>&1 | FileCheck %s -check-prefix=NODWP + +// RUN: %lldb \ +// RUN: -O "log enable dwarf split" \ +// RUN: -o "b main" \ +// RUN: -b %t.dwarf5.debug 2>&1 | FileCheck %s -check-prefix=NODWP + // Now test with DWARF4 // RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-4 -c %s -o %t.dwarf4.o // RUN: ld.lld %t.dwarf4.o -o %t.dwarf4 @@ -41,7 +70,10 @@ // RUN: rm %t.dwarf4.dwo // RUN: llvm-objcopy --only-keep-debug %t.dwarf4 %t.dwarf4.debug // RUN: llvm-objcopy --strip-all --add-gnu-debuglink=%t.dwarf4.debug %t.dwarf4 -// RUN: %lldb %t.dwarf4 -o "target variable a" -b | FileCheck %s +// RUN: %lldb \ +// RUN: -O "log enable dwarf split" \ +// RUN: -o "target variable a" \ +// RUN: -b %t.dwarf4 | FileCheck %s // Run one time with the index cache enabled to populate the index cache. 
When // we populate the index cache we have to parse all of the DWARF debug info @@ -69,6 +101,46 @@ // RUN: -o "statistics dump" \ // RUN: %t.dwarf4 -b | FileCheck %s -check-prefix=CACHED +// Make sure that if we load the "%t.dwarf4.debug" file, that we can find and +// load the .dwo file from the .dwp when it is "%t.dwarf4.dwp" +// RUN: %lldb %t.dwarf4.debug -o "b main" -b | FileCheck %s -check-prefix=DEBUG + +// Make sure that if we load the "%t.dwarf4" file, that we can find and +// load the .dwo file from the .dwp when it is "%t.dwarf4.debug.dwp" +// RUN: mv %t.dwarf4.dwp %t.dwarf4.debug.dwp +// RUN: %lldb %t.dwarf4 -o "b main" -b | FileCheck %s -check-prefix=DEBUG + +// Make sure that if we load the "%t.dwarf4.debug" file, that we can find and +// load the .dwo file from the .dwp when it is "%t.dwarf4.debug.dwp" +// RUN: %lldb %t.dwarf4.debug -o "b main" -b | FileCheck %s -check-prefix=DEBUG + +// Make sure that if we remove the .dwp file we see an appropriate error. +// RUN: rm %t.dwarf4.debug.dwp +// RUN: %lldb \ +// RUN: -O "log enable dwarf split" \ +// RUN: -o "b main" \ +// RUN: -b %t.dwarf4 2>&1 | FileCheck %s -check-prefix=NODWP + +// RUN: %lldb \ +// RUN: -O "log enable dwarf split" \ +// RUN: -o "b main" \ +// RUN: -b %t.dwarf4.debug 2>&1 | FileCheck %s -check-prefix=NODWP + +// Test if we have a GNU build ID in our main executable and in our debug file, +// and we have a .dwp file that doesn't, that we can still load our .dwp file. 
+// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -c %s -o %t.o +// RUN: ld.lld %t.o --build-id=md5 -o %t +// RUN: llvm-dwp %t.dwo -o %t.dwp +// RUN: rm %t.dwo +// RUN: llvm-objcopy --only-keep-debug %t %t.debug +// RUN: llvm-objcopy --strip-all --add-gnu-debuglink=%t.debug %t +// RUN: %lldb \ +// RUN: -O "log enable dwarf split" \ +// RUN: -o "target variable a" \ +// RUN: -b %t | FileCheck %s + +// CHECK: Searching for DWP using: +// CHECK: Found DWP file: // CHECK: (A) a = (x = 47) // CACHE: script lldb.target.modules[0].FindTypes('::A').GetTypeAtIndex(0) @@ -83,6 +155,16 @@ // CACHED-NEXT: } // CACHED: "totalDebugInfoIndexLoadedFromCache": 1 +// Make sure debug information was loaded by verifying that the +// DEBUG: Breakpoint 1: where = dwp-separate-debug-file.cpp.tmp.dwarf{{[45]}}{{(\.debug)?}}`main + {{[0-9]+}} at dwp-separate-debug-file.cpp:{{[0-9]+}}:{{[0-9]+}}, address = {{0x[0-9a-fA-F]+}} + +// Make sure if we load the stripped binary or the debug info file with no .dwp +// nor any .dwo files that we are not able to fine the .dwp or .dwo files. +// NODWP: Searching for DWP using: +// NODWP: Searching for DWP using: +// NODWP: Unable to locate for DWP file for: +// NODWP: unable to locate separate debug file (dwo, dwp). Debugging will be degraded. + struct A { int x = 47; }; From f40ee6e83f263fc4240c5b8d31a7e0e148a28cf6 Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Tue, 20 Feb 2024 20:46:34 -0600 Subject: [PATCH 042/351] [mlir][sparse] assemble SoA COO correctly. 
(#82449) --- .../Transforms/SparseAssembler.cpp | 74 +++++++++---------- mlir/test/Dialect/SparseTensor/external.mlir | 24 ++++++ 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp index 9414d81e6bf5c..cd6b9b4989373 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp @@ -22,13 +22,9 @@ using namespace sparse_tensor; // Helper methods. //===----------------------------------------------------------------------===// -// TODO: reuse StorageLayout::foreachField? - -// TODO: we need COO AoS and SoA - // Convert type range to new types range, with sparse tensors externalized. -void convTypes(TypeRange types, SmallVectorImpl &convTypes, - SmallVectorImpl *extraTypes = nullptr) { +static void convTypes(TypeRange types, SmallVectorImpl &convTypes, + SmallVectorImpl *extraTypes = nullptr) { for (auto type : types) { // All "dense" data passes through unmodified. if (!getSparseTensorEncoding(type)) { @@ -42,29 +38,30 @@ void convTypes(TypeRange types, SmallVectorImpl &convTypes, convTypes.push_back(vtp); if (extraTypes) extraTypes->push_back(vtp); - // Convert the external representations of the pos/crd arrays. - for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) { - const auto lt = stt.getLvlType(lvl); - if (isCompressedLT(lt) || isLooseCompressedLT(lt)) { - auto ptp = RankedTensorType::get(shape, stt.getPosType()); - auto ctp = RankedTensorType::get(shape, stt.getCrdType()); - convTypes.push_back(ptp); - convTypes.push_back(ctp); - if (extraTypes) { - extraTypes->push_back(ptp); - extraTypes->push_back(ctp); - } - } else { - assert(isDenseLT(lt)); // TODO: handle other cases + + // Convert the external representation of the position/coordinate array. 
+ foreachFieldAndTypeInSparseTensor(stt, [&convTypes, extraTypes]( + Type t, FieldIndex, + SparseTensorFieldKind kind, + Level, LevelType) { + if (kind == SparseTensorFieldKind::CrdMemRef || + kind == SparseTensorFieldKind::PosMemRef) { + ShapedType st = t.cast(); + auto rtp = RankedTensorType::get(st.getShape(), st.getElementType()); + convTypes.push_back(rtp); + if (extraTypes) + extraTypes->push_back(rtp); } - } + return true; + }); } } // Convert input and output values to [dis]assemble ops for sparse tensors. -void convVals(OpBuilder &builder, Location loc, TypeRange types, - ValueRange fromVals, ValueRange extraVals, - SmallVectorImpl &toVals, unsigned extra, bool isIn) { +static void convVals(OpBuilder &builder, Location loc, TypeRange types, + ValueRange fromVals, ValueRange extraVals, + SmallVectorImpl &toVals, unsigned extra, + bool isIn) { unsigned idx = 0; for (auto type : types) { // All "dense" data passes through unmodified. @@ -85,29 +82,28 @@ void convVals(OpBuilder &builder, Location loc, TypeRange types, if (!isIn) { inputs.push_back(extraVals[extra++]); retTypes.push_back(RankedTensorType::get(shape, stt.getElementType())); - cntTypes.push_back(builder.getIndexType()); + cntTypes.push_back(builder.getIndexType()); // nnz } + // Collect the external representations of the pos/crd arrays. 
- for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) { - const auto lt = stt.getLvlType(lvl); - if (isCompressedLT(lt) || isLooseCompressedLT(lt)) { + foreachFieldAndTypeInSparseTensor(stt, [&, isIn](Type t, FieldIndex, + SparseTensorFieldKind kind, + Level, LevelType) { + if (kind == SparseTensorFieldKind::CrdMemRef || + kind == SparseTensorFieldKind::PosMemRef) { if (isIn) { inputs.push_back(fromVals[idx++]); - inputs.push_back(fromVals[idx++]); } else { - Type pTp = stt.getPosType(); - Type cTp = stt.getCrdType(); - inputs.push_back(extraVals[extra++]); + ShapedType st = t.cast(); + auto rtp = RankedTensorType::get(st.getShape(), st.getElementType()); inputs.push_back(extraVals[extra++]); - retTypes.push_back(RankedTensorType::get(shape, pTp)); - retTypes.push_back(RankedTensorType::get(shape, cTp)); - cntTypes.push_back(pTp); - cntTypes.push_back(cTp); + retTypes.push_back(rtp); + cntTypes.push_back(rtp.getElementType()); } - } else { - assert(isDenseLT(lt)); // TODO: handle other cases } - } + return true; + }); + if (isIn) { // Assemble multiple inputs into a single sparse tensor. 
auto a = builder.create(loc, rtp, inputs); diff --git a/mlir/test/Dialect/SparseTensor/external.mlir b/mlir/test/Dialect/SparseTensor/external.mlir index c17ba13e86c92..b5701ad202426 100644 --- a/mlir/test/Dialect/SparseTensor/external.mlir +++ b/mlir/test/Dialect/SparseTensor/external.mlir @@ -100,3 +100,27 @@ func.func @sparse_out2(%arg0: tensor<64x64xf32>) -> (tensor<64x64xf32>, tensor<6 func.func @sparse_inout(%arg0: tensor<64x64xf32, #sparse>) -> tensor<64x64xf32, #sparse> { return %arg0 : tensor<64x64xf32, #sparse> } + +// ----- + +// CHECK-LABEL: func.func @sparse_inout_coo_soa( +// CHECK-SAME: %[[A:.*0]]: tensor, +// CHECK-SAME: %[[B:.*1]]: tensor, +// CHECK-SAME: %[[C:.*2]]: tensor, +// CHECK-SAME: %[[D:.*3]]: tensor, +// CHECK-SAME: %[[E:.*4]]: tensor, +// CHECK-SAME: %[[F:.*5]]: tensor, +// CHECK-SAME: %[[G:.*6]]: tensor, +// CHECK-SAME: %[[H:.*7]]: tensor) -> (tensor, tensor, tensor, tensor) { +// CHECK: %[[I:.*]] = sparse_tensor.assemble %[[A]], %[[B]], %[[C]], %[[D]] +// CHECK: %[[F:.*]] = call @_internal_sparse_inout_coo_soa(%[[I]]) +// CHECK: sparse_tensor.disassemble %[[F]] +// CHECK: return +// CHECK: } +// CHECK: func.func private @_internal_sparse_inout +#sparse = #sparse_tensor.encoding<{ + map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton(soa)) +}> +func.func @sparse_inout_coo_soa(%arg0: tensor<64x64xf32, #sparse>) -> tensor<64x64xf32, #sparse> { + return %arg0 : tensor<64x64xf32, #sparse> +} From c02b0d008c17cdf8dc46ad930c69311bcd8c7dd4 Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Tue, 20 Feb 2024 22:42:14 -0500 Subject: [PATCH 043/351] [GlobalISel] Make sure to check for load barriers when merging G_EXTRACT_VECTOR_ELT into G_LOAD. 
(#82306) Fixes https://github.com/llvm/llvm-project/issues/78477 --- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 12 +++++ .../CodeGen/AArch64/extractvector-of-load.mir | 46 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/extractvector-of-load.mir diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 779ec49f4d13a..e8a5c6fedc395 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1199,6 +1199,18 @@ bool CombinerHelper::matchCombineExtractedVectorLoad(MachineInstr &MI, if (!VecEltTy.isByteSized()) return false; + // Check for load fold barriers between the extraction and the load. + if (MI.getParent() != LoadMI->getParent()) + return false; + const unsigned MaxIter = 20; + unsigned Iter = 0; + for (auto II = LoadMI->getIterator(), IE = MI.getIterator(); II != IE; ++II) { + if (II->isLoadFoldBarrier()) + return false; + if (Iter++ == MaxIter) + return false; + } + // Check if the new load that we are going to create is legal // if we are in the post-legalization phase. 
MachineMemOperand MMO = LoadMI->getMMO(); diff --git a/llvm/test/CodeGen/AArch64/extractvector-of-load.mir b/llvm/test/CodeGen/AArch64/extractvector-of-load.mir new file mode 100644 index 0000000000000..43051232b436d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/extractvector-of-load.mir @@ -0,0 +1,46 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: f +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } +liveins: + - { reg: '$x0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: f + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) + ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[COPY]](p0) :: (store (<4 x s32>)) + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<2 x s32>), [[C1]](s64) + ; CHECK-NEXT: $w0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(p0) = COPY $x0 + %3:_(s32) = G_CONSTANT i32 0 + %2:_(<4 x s32>) = G_BUILD_VECTOR %3(s32), %3(s32), %3(s32), %3(s32) + %5:_(s64) = G_CONSTANT i64 0 + %1:_(<2 x s32>) = G_LOAD %0(p0) :: (load (<2 x s32>)) + G_STORE %2(<4 x s32>), %0(p0) :: (store (<4 x s32>)) + %4:_(s32) = G_EXTRACT_VECTOR_ELT %1(<2 x s32>), %5(s64) + $w0 = COPY %4(s32) + RET_ReallyLR implicit $w0 + +... 
From 03203b79c6247465850ee6e9f3e2399afc35720b Mon Sep 17 00:00:00 2001 From: Michal Paszkowski Date: Tue, 20 Feb 2024 20:04:04 -0800 Subject: [PATCH 044/351] [SPIR-V] Fix vloadn OpenCL builtin lowering (#81148) This pull request fixes an issue with missing vector element count immediate in OpExtInst calls and adds a case for generating bitcasts before GEPs for kernel arguments of non-matching pointer type. The new LITs are based on basic/vload_local and basic/vload_global OpenCL CTS tests. The tests after this change pass SPIR-V validation. --- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 2 + llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 16 +++- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 27 +++--- .../SPIRV/opencl/basic/vstore_private.ll | 95 ------------------- llvm/test/CodeGen/SPIRV/opencl/vload2.ll | 40 ++++++++ .../pointers/getelementptr-kernel-arg-char.ll | 31 ++++++ 6 files changed, 97 insertions(+), 114 deletions(-) delete mode 100644 llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll create mode 100644 llvm/test/CodeGen/SPIRV/opencl/vload2.ll create mode 100644 llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 8a354dd04640b..c1bb27322443f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -141,6 +141,7 @@ struct VectorLoadStoreBuiltin { StringRef Name; InstructionSet::InstructionSet Set; uint32_t Number; + uint32_t ElementCount; bool IsRounded; FPRoundingMode::FPRoundingMode RoundingMode; }; @@ -2042,6 +2043,7 @@ static bool generateVectorLoadStoreInst(const SPIRV::IncomingCall *Call, .addImm(Builtin->Number); for (auto Argument : Call->Arguments) MIB.addUse(Argument); + MIB.addImm(Builtin->ElementCount); // Rounding mode should be passed as a last argument in the MI for builtins // like "vstorea_halfn_r". 
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 571cfcfd6e7e5..e6e3560d02f58 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1236,18 +1236,24 @@ class VectorLoadStoreBuiltin { string Name = name; InstructionSet Set = set; bits<32> Number = number; + bits<32> ElementCount = !cond(!not(!eq(!find(name, "2"), -1)) : 2, + !not(!eq(!find(name, "3"), -1)) : 3, + !not(!eq(!find(name, "4"), -1)) : 4, + !not(!eq(!find(name, "8"), -1)) : 8, + !not(!eq(!find(name, "16"), -1)) : 16, + true : 1); bit IsRounded = !not(!eq(!find(name, "_rt"), -1)); FPRoundingMode RoundingMode = !cond(!not(!eq(!find(name, "_rte"), -1)) : RTE, - !not(!eq(!find(name, "_rtz"), -1)) : RTZ, - !not(!eq(!find(name, "_rtp"), -1)) : RTP, - !not(!eq(!find(name, "_rtn"), -1)) : RTN, - true : RTE); + !not(!eq(!find(name, "_rtz"), -1)) : RTZ, + !not(!eq(!find(name, "_rtp"), -1)) : RTP, + !not(!eq(!find(name, "_rtn"), -1)) : RTN, + true : RTE); } // Table gathering all the vector data load/store builtins. 
def VectorLoadStoreBuiltins : GenericTable { let FilterClass = "VectorLoadStoreBuiltin"; - let Fields = ["Name", "Set", "Number", "IsRounded", "RoundingMode"]; + let Fields = ["Name", "Set", "Number", "ElementCount", "IsRounded", "RoundingMode"]; string TypeOf_Set = "InstructionSet"; string TypeOf_RoundingMode = "FPRoundingMode"; } diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 26a5d7a30f19d..e32cd50be56e3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -290,25 +290,14 @@ void SPIRVEmitIntrinsics::insertPtrCastInstr(Instruction *I) { Value *Pointer; Type *ExpectedElementType; unsigned OperandToReplace; - bool AllowCastingToChar = false; StoreInst *SI = dyn_cast(I); if (SI && F->getCallingConv() == CallingConv::SPIR_KERNEL && SI->getValueOperand()->getType()->isPointerTy() && isa(SI->getValueOperand())) { - Argument *Arg = cast(SI->getValueOperand()); - MDString *ArgType = getOCLKernelArgType(*Arg->getParent(), Arg->getArgNo()); - if (!ArgType || ArgType->getString().starts_with("uchar*")) - return; - - // Handle special case when StoreInst's value operand is a kernel argument - // of a pointer type. Since these arguments could have either a basic - // element type (e.g. float*) or OpenCL builtin type (sampler_t), bitcast - // the StoreInst's value operand to default pointer element type (i8). - Pointer = Arg; + Pointer = SI->getValueOperand(); ExpectedElementType = IntegerType::getInt8Ty(F->getContext()); OperandToReplace = 0; - AllowCastingToChar = true; } else if (SI) { Pointer = SI->getPointerOperand(); ExpectedElementType = SI->getValueOperand()->getType(); @@ -390,10 +379,20 @@ void SPIRVEmitIntrinsics::insertPtrCastInstr(Instruction *I) { } // Do not emit spv_ptrcast if it would cast to the default pointer element - // type (i8) of the same address space. 
- if (ExpectedElementType->isIntegerTy(8) && !AllowCastingToChar) + // type (i8) of the same address space. In case of OpenCL kernels, make sure + // i8 is the pointer element type defined for the given kernel argument. + if (ExpectedElementType->isIntegerTy(8) && + F->getCallingConv() != CallingConv::SPIR_KERNEL) return; + Argument *Arg = dyn_cast(Pointer); + if (ExpectedElementType->isIntegerTy(8) && + F->getCallingConv() == CallingConv::SPIR_KERNEL && Arg) { + MDString *ArgType = getOCLKernelArgType(*Arg->getParent(), Arg->getArgNo()); + if (ArgType && ArgType->getString().starts_with("uchar*")) + return; + } + // If this would be the first spv_ptrcast, the pointer's defining instruction // requires spv_assign_ptr_type and does not already have one, do not emit // spv_ptrcast and emit spv_assign_ptr_type instead. diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll deleted file mode 100644 index 40f1d59e4365e..0000000000000 --- a/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll +++ /dev/null @@ -1,95 +0,0 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s - -; TODO(#60133): Requires updates following opaque pointer migration. 
-; XFAIL: * - -; CHECK: %[[#i16_ty:]] = OpTypeInt 16 0 -; CHECK: %[[#v4xi16_ty:]] = OpTypeVector %[[#i16_ty]] 4 -; CHECK: %[[#pv4xi16_ty:]] = OpTypePointer Function %[[#v4xi16_ty]] -; CHECK: %[[#i16_const0:]] = OpConstant %[[#i16_ty]] 0 -; CHECK: %[[#i16_undef:]] = OpUndef %[[#i16_ty]] -; CHECK: %[[#comp_const:]] = OpConstantComposite %[[#v4xi16_ty]] %[[#i16_const0]] %[[#i16_const0]] %[[#i16_const0]] %[[#i16_undef]] - -; CHECK: %[[#r:]] = OpInBoundsPtrAccessChain -; CHECK: %[[#r2:]] = OpBitcast %[[#pv4xi16_ty]] %[[#r]] -; CHECK: OpStore %[[#r2]] %[[#comp_const]] Aligned 8 - -define spir_kernel void @test_fn(i16 addrspace(1)* %srcValues, i32 addrspace(1)* %offsets, <3 x i16> addrspace(1)* %destBuffer, i32 %alignmentOffset) { -entry: - %sPrivateStorage = alloca [42 x <3 x i16>], align 8 - %0 = bitcast [42 x <3 x i16>]* %sPrivateStorage to i8* - %1 = bitcast i8* %0 to i8* - call void @llvm.lifetime.start.p0i8(i64 336, i8* %1) - %2 = call spir_func <3 x i64> @BuiltInGlobalInvocationId() - %call = extractelement <3 x i64> %2, i32 0 - %conv = trunc i64 %call to i32 - %idxprom = sext i32 %conv to i64 - %arrayidx = getelementptr inbounds [42 x <3 x i16>], [42 x <3 x i16>]* %sPrivateStorage, i64 0, i64 %idxprom - %storetmp = bitcast <3 x i16>* %arrayidx to <4 x i16>* - store <4 x i16> , <4 x i16>* %storetmp, align 8 - %conv1 = sext i32 %conv to i64 - %call2 = call spir_func <3 x i16> @OpenCL_vload3_i64_p1i16_i32(i64 %conv1, i16 addrspace(1)* %srcValues, i32 3) - %idxprom3 = sext i32 %conv to i64 - %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %idxprom3 - %3 = load i32, i32 addrspace(1)* %arrayidx4, align 4 - %conv5 = zext i32 %3 to i64 - %arraydecay = getelementptr inbounds [42 x <3 x i16>], [42 x <3 x i16>]* %sPrivateStorage, i64 0, i64 0 - %4 = bitcast <3 x i16>* %arraydecay to i16* - %idx.ext = zext i32 %alignmentOffset to i64 - %add.ptr = getelementptr inbounds i16, i16* %4, i64 %idx.ext - call spir_func void @OpenCL_vstore3_v3i16_i64_p0i16(<3 
x i16> %call2, i64 %conv5, i16* %add.ptr) - %arraydecay6 = getelementptr inbounds [42 x <3 x i16>], [42 x <3 x i16>]* %sPrivateStorage, i64 0, i64 0 - %5 = bitcast <3 x i16>* %arraydecay6 to i16* - %idxprom7 = sext i32 %conv to i64 - %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %idxprom7 - %6 = load i32, i32 addrspace(1)* %arrayidx8, align 4 - %mul = mul i32 3, %6 - %idx.ext9 = zext i32 %mul to i64 - %add.ptr10 = getelementptr inbounds i16, i16* %5, i64 %idx.ext9 - %idx.ext11 = zext i32 %alignmentOffset to i64 - %add.ptr12 = getelementptr inbounds i16, i16* %add.ptr10, i64 %idx.ext11 - %7 = bitcast <3 x i16> addrspace(1)* %destBuffer to i16 addrspace(1)* - %idxprom13 = sext i32 %conv to i64 - %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %idxprom13 - %8 = load i32, i32 addrspace(1)* %arrayidx14, align 4 - %mul15 = mul i32 3, %8 - %idx.ext16 = zext i32 %mul15 to i64 - %add.ptr17 = getelementptr inbounds i16, i16 addrspace(1)* %7, i64 %idx.ext16 - %idx.ext18 = zext i32 %alignmentOffset to i64 - %add.ptr19 = getelementptr inbounds i16, i16 addrspace(1)* %add.ptr17, i64 %idx.ext18 - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp ult i32 %i.0, 3 - br i1 %cmp, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %idxprom21 = zext i32 %i.0 to i64 - %arrayidx22 = getelementptr inbounds i16, i16* %add.ptr12, i64 %idxprom21 - %9 = load i16, i16* %arrayidx22, align 2 - %idxprom23 = zext i32 %i.0 to i64 - %arrayidx24 = getelementptr inbounds i16, i16 addrspace(1)* %add.ptr19, i64 %idxprom23 - store i16 %9, i16 addrspace(1)* %arrayidx24, align 2 - br label %for.inc - -for.inc: ; preds = %for.body - %inc = add i32 %i.0, 1 - br label %for.cond - -for.end: ; preds = %for.cond - %10 = bitcast [42 x <3 x i16>]* %sPrivateStorage to i8* - %11 = bitcast i8* %10 to i8* - call void @llvm.lifetime.end.p0i8(i64 336, i8* %11) - ret void -} - 
-declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) - -declare spir_func <3 x i16> @OpenCL_vload3_i64_p1i16_i32(i64, i16 addrspace(1)*, i32) - -declare spir_func void @OpenCL_vstore3_v3i16_i64_p0i16(<3 x i16>, i64, i16*) - -declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) - -declare spir_func <3 x i64> @BuiltInGlobalInvocationId() diff --git a/llvm/test/CodeGen/SPIRV/opencl/vload2.ll b/llvm/test/CodeGen/SPIRV/opencl/vload2.ll new file mode 100644 index 0000000000000..b219aebc29bef --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/vload2.ll @@ -0,0 +1,40 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; This test only intends to check the vloadn builtin name resolution. +; The calls to the OpenCL builtins are not valid and will not pass SPIR-V validation. + +; CHECK-DAG: %[[#IMPORT:]] = OpExtInstImport "OpenCL.std" + +; CHECK-DAG: %[[#INT8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#INT16:]] = OpTypeInt 16 0 +; CHECK-DAG: %[[#INT32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#INT64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#FLOAT:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#VINT8:]] = OpTypeVector %[[#INT8]] 2 +; CHECK-DAG: %[[#VINT16:]] = OpTypeVector %[[#INT16]] 2 +; CHECK-DAG: %[[#VINT32:]] = OpTypeVector %[[#INT32]] 2 +; CHECK-DAG: %[[#VINT64:]] = OpTypeVector %[[#INT64]] 2 +; CHECK-DAG: %[[#VFLOAT:]] = OpTypeVector %[[#FLOAT]] 2 +; CHECK-DAG: %[[#PTRINT8:]] = OpTypePointer CrossWorkgroup %[[#INT8]] + +; CHECK: %[[#OFFSET:]] = OpFunctionParameter %[[#INT64]] +; CHECK: %[[#ADDRESS:]] = OpFunctionParameter %[[#PTRINT8]] + +define spir_kernel void @test_fn(i64 %offset, ptr addrspace(1) %address) { +; CHECK: %[[#]] = OpExtInst %[[#VINT8]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2 + %call1 = call spir_func <2 x i8> @_Z6vload2mPU3AS1Kc(i64 %offset, ptr addrspace(1) %address) +; CHECK: %[[#]] = OpExtInst %[[#VINT16]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2 + %call2 = call spir_func <2 x i16> @_Z6vload2mPU3AS1Ks(i64 
%offset, ptr addrspace(1) %address) +; CHECK: %[[#]] = OpExtInst %[[#VINT32]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2 + %call3 = call spir_func <2 x i32> @_Z6vload2mPU3AS1Ki(i64 %offset, ptr addrspace(1) %address) +; CHECK: %[[#]] = OpExtInst %[[#VINT64]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2 + %call4 = call spir_func <2 x i64> @_Z6vload2mPU3AS1Kl(i64 %offset, ptr addrspace(1) %address) +; CHECK: %[[#]] = OpExtInst %[[#VFLOAT]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2 + %call5 = call spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64 %offset, ptr addrspace(1) %address) + ret void +} + +declare spir_func <2 x i8> @_Z6vload2mPU3AS1Kc(i64, ptr addrspace(1)) +declare spir_func <2 x i16> @_Z6vload2mPU3AS1Ks(i64, ptr addrspace(1)) +declare spir_func <2 x i32> @_Z6vload2mPU3AS1Ki(i64, ptr addrspace(1)) +declare spir_func <2 x i64> @_Z6vload2mPU3AS1Kl(i64, ptr addrspace(1)) +declare spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64, ptr addrspace(1)) diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll new file mode 100644 index 0000000000000..cca71d409d258 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll @@ -0,0 +1,31 @@ + +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#INT8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#INT64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#VINT8:]] = OpTypeVector %[[#INT8]] 2 +; CHECK-DAG: %[[#PTRINT8:]] = OpTypePointer Workgroup %[[#INT8]] +; CHECK-DAG: %[[#PTRVINT8:]] = OpTypePointer Workgroup %[[#VINT8]] +; CHECK-DAG: %[[#CONST:]] = OpConstant %[[#INT64]] 1 + +; CHECK: %[[#PARAM1:]] = OpFunctionParameter %[[#PTRVINT8]] +define spir_kernel void @test1(ptr addrspace(3) %address) !kernel_arg_type !1 { +; CHECK: %[[#BITCAST1:]] = OpBitcast %[[#PTRINT8]] 
%[[#PARAM1]] +; CHECK: %[[#]] = OpInBoundsPtrAccessChain %[[#PTRINT8]] %[[#BITCAST1]] %[[#CONST]] + %cast = bitcast ptr addrspace(3) %address to ptr addrspace(3) + %gep = getelementptr inbounds i8, ptr addrspace(3) %cast, i64 1 + ret void +} + +; CHECK: %[[#PARAM2:]] = OpFunctionParameter %[[#PTRVINT8]] +define spir_kernel void @test2(ptr addrspace(3) %address) !kernel_arg_type !1 { +; CHECK: %[[#BITCAST2:]] = OpBitcast %[[#PTRINT8]] %[[#PARAM2]] +; CHECK: %[[#]] = OpInBoundsPtrAccessChain %[[#PTRINT8]] %[[#BITCAST2]] %[[#CONST]] + %gep = getelementptr inbounds i8, ptr addrspace(3) %address, i64 1 + ret void +} + +declare spir_func <2 x i8> @_Z6vload2mPU3AS3Kc(i64, ptr addrspace(3)) + +!1 = !{!"char2*"} From 79889734b940356ab3381423c93ae06f22e772c9 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Wed, 21 Feb 2024 10:06:37 +0530 Subject: [PATCH 045/351] Implement convergence control in MIR using SelectionDAG (#71785) LLVM function calls carry convergence control tokens as operand bundles, where the tokens themselves are produced by convergence control intrinsics. This patch implements convergence control tokens in MIR as follows: 1. Introduce target-independent ISD opcodes and MIR opcodes for convergence control intrinsics. 2. Model token values as untyped virtual registers in MIR. The change also introduces an additional ISD opcode CONVERGENCECTRL_GLUE and a corresponding machine opcode with the same spelling. This glues the convergence control token to SDNodes that represent calls to intrinsics. The glued token is later translated to an implicit argument in the MIR. The lowering of calls to user-defined functions is target-specific. On AMDGPU, the convergence control operand bundle at a non-intrinsic call is translated to an explicit argument to the SI_CALL_ISEL instruction. Post-selection adjustment converts this explicit argument to an implicit argument on the SI_CALL instruction. 
--- .../llvm/ADT/GenericConvergenceVerifier.h | 9 +- .../llvm/CodeGen/FunctionLoweringInfo.h | 10 +-- llvm/include/llvm/CodeGen/ISDOpcodes.h | 9 ++ .../llvm/CodeGen/MachineConvergenceVerifier.h | 28 ++++++ llvm/include/llvm/CodeGen/SelectionDAGISel.h | 4 + llvm/include/llvm/CodeGen/TargetLowering.h | 6 ++ .../llvm/IR/GenericConvergenceVerifierImpl.h | 25 +++--- llvm/include/llvm/Support/TargetOpcodes.def | 5 ++ llvm/include/llvm/Target/Target.td | 19 ++++ .../include/llvm/Target/TargetSelectionDAG.td | 10 +++ llvm/lib/CodeGen/CMakeLists.txt | 1 + .../CodeGen/MachineConvergenceVerifier.cpp | 86 +++++++++++++++++++ llvm/lib/CodeGen/MachineVerifier.cpp | 34 ++++++++ .../SelectionDAG/FunctionLoweringInfo.cpp | 10 +++ .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 44 +++++++++- .../SelectionDAG/SelectionDAGBuilder.cpp | 50 ++++++++++- .../SelectionDAG/SelectionDAGBuilder.h | 1 + .../SelectionDAG/SelectionDAGDumper.cpp | 10 +++ .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 24 ++++++ llvm/lib/CodeGen/ValueTypes.cpp | 2 + llvm/lib/IR/ConvergenceVerifier.cpp | 27 ++++-- llvm/lib/IR/Verifier.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 27 +++++- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 12 ++- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 26 +++++- llvm/lib/Target/AMDGPU/SIInstructions.td | 8 +- .../test/CodeGen/AMDGPU/convergence-tokens.ll | 83 ++++++++++++++++++ .../CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll | 18 ++++ .../kernel-vgpr-spill-mubuf-with-voffset.ll | 1 + .../AMDGPU/need-fp-from-vgpr-spills.ll | 18 ++-- .../AMDGPU/no-source-locations-in-prologue.ll | 1 + .../AMDGPU/sgpr-spills-split-regalloc.ll | 15 ++-- .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 78 +++++++++-------- llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 26 +++--- .../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 1 + .../AMDGPU/whole-wave-register-spill.ll | 1 + .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 2 + 
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 4 + llvm/test/CodeGen/PowerPC/fmf-propagation.ll | 12 +-- .../convergencectrl/AMDGPU/basic.mir | 37 ++++++++ .../convergencectrl/AMDGPU/cycles.mir | 52 +++++++++++ .../convergencectrl/AMDGPU/lit.local.cfg | 2 + .../convergencectrl/AMDGPU/mixed2.mir | 15 ++++ .../convergencectrl/AMDGPU/not-ssa.mir | 11 +++ .../convergencectrl/AMDGPU/region-nesting.mir | 24 ++++++ .../builtins/match-table-replacerreg.td | 2 +- .../match-table-imms.td | 30 +++---- .../match-table-intrinsics.td | 2 +- .../match-table-patfrag-root.td | 2 +- .../match-table-variadics.td | 4 +- .../GlobalISelCombinerEmitter/match-table.td | 62 ++++++------- 52 files changed, 831 insertions(+), 162 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/MachineConvergenceVerifier.h create mode 100644 llvm/lib/CodeGen/MachineConvergenceVerifier.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/convergence-tokens.ll create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/basic.mir create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/cycles.mir create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/lit.local.cfg create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/mixed2.mir create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/not-ssa.mir create mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/region-nesting.mir diff --git a/llvm/include/llvm/ADT/GenericConvergenceVerifier.h b/llvm/include/llvm/ADT/GenericConvergenceVerifier.h index 0810a07013229..d2943cf682f4f 100644 --- a/llvm/include/llvm/ADT/GenericConvergenceVerifier.h +++ b/llvm/include/llvm/ADT/GenericConvergenceVerifier.h @@ -32,11 +32,12 @@ template class GenericConvergenceVerifier { void initialize(raw_ostream *OS, function_ref FailureCB, - const FunctionT &F) { + const FunctionT &F, bool _IsSSA) { clear(); this->OS = OS; this->FailureCB = FailureCB; Context = ContextT(&F); + IsSSA = _IsSSA; } void clear(); @@ -52,6 +53,7 @@ 
template class GenericConvergenceVerifier { DominatorTreeT *DT; CycleInfoT CI; ContextT Context; + bool IsSSA; /// Whether the current function has convergencectrl operand bundles. enum { @@ -60,6 +62,10 @@ template class GenericConvergenceVerifier { NoConvergence } ConvergenceKind = NoConvergence; + /// The control token operation performed by a convergence control Intrinsic + /// in LLVM IR, or by a CONVERGENCECTRL* instruction in MIR + enum ConvOpKind { CONV_ANCHOR, CONV_ENTRY, CONV_LOOP, CONV_NONE }; + // Cache token uses found so far. Note that we track the unique definitions // and not the token values. DenseMap Tokens; @@ -68,6 +74,7 @@ template class GenericConvergenceVerifier { static bool isInsideConvergentFunction(const InstructionT &I); static bool isConvergent(const InstructionT &I); + static ConvOpKind getConvOp(const InstructionT &I); const InstructionT *findAndCheckConvergenceTokenUsed(const InstructionT &I); void reportFailure(const Twine &Message, ArrayRef Values); diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h index cde7247aeb151..31af3014afe4e 100644 --- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -215,15 +215,7 @@ class FunctionLoweringInfo { Register CreateRegs(Type *Ty, bool isDivergent = false); - Register InitializeRegForValue(const Value *V) { - // Tokens never live in vregs. - if (V->getType()->isTokenTy()) - return 0; - Register &R = ValueMap[V]; - assert(R == 0 && "Already initialized this value register!"); - assert(VirtReg2Value.empty()); - return R = CreateRegs(V); - } + Register InitializeRegForValue(const Value *V); /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the /// register is a PHI destination and the PHI's LiveOutInfo is not valid. 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 8cb0bc9fd9813..079abb3a5be3a 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1384,6 +1384,15 @@ enum NodeType { #define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) VPSDID, #include "llvm/IR/VPIntrinsics.def" + // The `llvm.experimental.convergence.*` intrinsics. + CONVERGENCECTRL_ANCHOR, + CONVERGENCECTRL_ENTRY, + CONVERGENCECTRL_LOOP, + // This does not correspond to any convergence control intrinsic. It used to + // glue a convergence control token to a convergent operation in the DAG, + // which is later translated to an implicit use in the MIR. + CONVERGENCECTRL_GLUE, + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/llvm/include/llvm/CodeGen/MachineConvergenceVerifier.h b/llvm/include/llvm/CodeGen/MachineConvergenceVerifier.h new file mode 100644 index 0000000000000..b2faa30816c68 --- /dev/null +++ b/llvm/include/llvm/CodeGen/MachineConvergenceVerifier.h @@ -0,0 +1,28 @@ +//===- MachineConvergenceVerifier.h - Verify convergenctrl ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file declares the MIR specialization of the GenericConvergenceVerifier +/// template. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINECONVERGENCEVERIFIER_H +#define LLVM_CODEGEN_MACHINECONVERGENCEVERIFIER_H + +#include "llvm/ADT/GenericConvergenceVerifier.h" +#include "llvm/CodeGen/MachineSSAContext.h" + +namespace llvm { + +using MachineConvergenceVerifier = + GenericConvergenceVerifier; + +} // namespace llvm + +#endif // LLVM_CODEGEN_MACHINECONVERGENCEVERIFIER_H diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index dbd9b391f4a43..837f8bf7263ea 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -459,6 +459,10 @@ class SelectionDAGISel : public MachineFunctionPass { void Select_ARITH_FENCE(SDNode *N); void Select_MEMBARRIER(SDNode *N); + void Select_CONVERGENCECTRL_ANCHOR(SDNode *N); + void Select_CONVERGENCECTRL_ENTRY(SDNode *N); + void Select_CONVERGENCECTRL_LOOP(SDNode *N); + void pushStackMapLiveVariable(SmallVectorImpl &Ops, SDValue Operand, SDLoc DL); void Select_STACKMAP(SDNode *N); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 612433b54f6e4..cbdeaf8b38783 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4401,6 +4401,7 @@ class TargetLowering : public TargetLoweringBase { SmallVector Ins; SmallVector InVals; const ConstantInt *CFIType = nullptr; + SDValue ConvergenceControlToken; CallLoweringInfo(SelectionDAG &DAG) : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false), @@ -4534,6 +4535,11 @@ class TargetLowering : public TargetLoweringBase { return *this; } + CallLoweringInfo &setConvergenceControlToken(SDValue Token) { + ConvergenceControlToken = Token; + return *this; + } + ArgListTy &getArgs() { return Args; } diff --git a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h 
b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h index f6eb5066d5535..9c20aa6499ee8 100644 --- a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h +++ b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h @@ -52,6 +52,7 @@ template void GenericConvergenceVerifier::clear() { Tokens.clear(); CI.clear(); ConvergenceKind = NoConvergence; + IsSSA = false; } template @@ -61,12 +62,16 @@ void GenericConvergenceVerifier::visit(const BlockT &BB) { template void GenericConvergenceVerifier::visit(const InstructionT &I) { - auto ID = ContextT::getIntrinsicID(I); + ConvOpKind ConvOp = getConvOp(I); + if (!IsSSA) { + Check(ConvOp == CONV_NONE, "Convergence control requires SSA.", + {Context.print(&I)}); + return; + } auto *TokenDef = findAndCheckConvergenceTokenUsed(I); - bool IsCtrlIntrinsic = true; - switch (ID) { - case Intrinsic::experimental_convergence_entry: + switch (ConvOp) { + case CONV_ENTRY: Check(isInsideConvergentFunction(I), "Entry intrinsic can occur only in a convergent function.", {Context.print(&I)}); @@ -78,13 +83,13 @@ void GenericConvergenceVerifier::visit(const InstructionT &I) { "same basic block.", {Context.print(&I)}); LLVM_FALLTHROUGH; - case Intrinsic::experimental_convergence_anchor: + case CONV_ANCHOR: Check(!TokenDef, "Entry or anchor intrinsic cannot have a convergencectrl token " "operand.", {Context.print(&I)}); break; - case Intrinsic::experimental_convergence_loop: + case CONV_LOOP: Check(TokenDef, "Loop intrinsic must have a convergencectrl token operand.", {Context.print(&I)}); Check(!SeenFirstConvOp, @@ -93,14 +98,13 @@ void GenericConvergenceVerifier::visit(const InstructionT &I) { {Context.print(&I)}); break; default: - IsCtrlIntrinsic = false; break; } if (isConvergent(I)) SeenFirstConvOp = true; - if (TokenDef || IsCtrlIntrinsic) { + if (TokenDef || ConvOp != CONV_NONE) { Check(isConvergent(I), "Convergence control token can only be used in a convergent call.", {Context.print(&I)}); @@ -161,8 +165,7 @@ void 
GenericConvergenceVerifier::verify(const DominatorTreeT &DT) { return; } - Check(ContextT::getIntrinsicID(*User) == - Intrinsic::experimental_convergence_loop, + Check(getConvOp(*User) == CONV_LOOP, "Convergence token used by an instruction other than " "llvm.experimental.convergence.loop in a cycle that does " "not contain the token's definition.", @@ -199,7 +202,7 @@ void GenericConvergenceVerifier::verify(const DominatorTreeT &DT) { for (auto &I : *BB) { if (auto *Token = Tokens.lookup(&I)) checkToken(Token, &I, LiveTokens); - if (isConvergenceControlIntrinsic(ContextT::getIntrinsicID(I))) + if (getConvOp(I) != CONV_NONE) LiveTokens.push_back(&I); } diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 42cb854d95050..6aded2ceebe13 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -225,6 +225,11 @@ HANDLE_TARGET_OPCODE(MEMBARRIER) // using. HANDLE_TARGET_OPCODE(JUMP_TABLE_DEBUG_INFO) +HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ENTRY) +HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ANCHOR) +HANDLE_TARGET_OPCODE(CONVERGENCECTRL_LOOP) +HANDLE_TARGET_OPCODE(CONVERGENCECTRL_GLUE) + /// The following generic opcodes are not supposed to appear after ISel. /// This is something we might want to relax, but for now, this is convenient /// to produce diagnostics. 
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 0d97a47190b19..0577c58f8da2d 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1483,6 +1483,25 @@ def JUMP_TABLE_DEBUG_INFO : StandardPseudoInstruction { let isMeta = true; } +let hasSideEffects = false, isMeta = true, isConvergent = true in { +def CONVERGENCECTRL_ANCHOR : StandardPseudoInstruction { + let OutOperandList = (outs unknown:$dst); + let InOperandList = (ins); +} +def CONVERGENCECTRL_ENTRY : StandardPseudoInstruction { + let OutOperandList = (outs unknown:$dst); + let InOperandList = (ins); +} +def CONVERGENCECTRL_LOOP : StandardPseudoInstruction { + let OutOperandList = (outs unknown:$dst); + let InOperandList = (ins unknown:$src); +} +def CONVERGENCECTRL_GLUE : StandardPseudoInstruction { + let OutOperandList = (outs); + let InOperandList = (ins unknown:$src); +} +} + // Generic opcodes used in GlobalISel. include "llvm/Target/GenericOpcodes.td" diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 5f8bf0d448105..b33c12a125ce5 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -782,6 +782,16 @@ def assertsext : SDNode<"ISD::AssertSext", SDT_assert>; def assertzext : SDNode<"ISD::AssertZext", SDT_assert>; def assertalign : SDNode<"ISD::AssertAlign", SDT_assert>; +def convergencectrl_anchor : SDNode<"ISD::CONVERGENCECTRL_ANCHOR", + SDTypeProfile<1, 0, [SDTCisVT<0,untyped>]>>; +def convergencectrl_entry : SDNode<"ISD::CONVERGENCECTRL_ENTRY", + SDTypeProfile<1, 0, [SDTCisVT<0,untyped>]>>; +def convergencectrl_loop : SDNode<"ISD::CONVERGENCECTRL_LOOP", + SDTypeProfile<1, 1, + [SDTCisVT<0,untyped>, SDTCisVT<1,untyped>]>>; +def convergencectrl_glue : SDNode<"ISD::CONVERGENCECTRL_GLUE", + SDTypeProfile<0, 1, [SDTCisVT<0, untyped>]>>; + 
//===----------------------------------------------------------------------===// // Selection DAG Condition Codes diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index d49bcf8a0c8ee..82d665b0691d2 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -109,6 +109,7 @@ add_llvm_component_library(LLVMCodeGen MachineBranchProbabilityInfo.cpp MachineCFGPrinter.cpp MachineCombiner.cpp + MachineConvergenceVerifier.cpp MachineCopyPropagation.cpp MachineCSE.cpp MachineCheckDebugify.cpp diff --git a/llvm/lib/CodeGen/MachineConvergenceVerifier.cpp b/llvm/lib/CodeGen/MachineConvergenceVerifier.cpp new file mode 100644 index 0000000000000..2f384fe6204d1 --- /dev/null +++ b/llvm/lib/CodeGen/MachineConvergenceVerifier.cpp @@ -0,0 +1,86 @@ +//===- ConvergenceVerifier.cpp - Verify convergence control -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineConvergenceVerifier.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSSAContext.h" +#include "llvm/IR/GenericConvergenceVerifierImpl.h" + +using namespace llvm; + +template <> +auto GenericConvergenceVerifier::getConvOp( + const MachineInstr &MI) -> ConvOpKind { + switch (MI.getOpcode()) { + default: + return CONV_NONE; + case TargetOpcode::CONVERGENCECTRL_ENTRY: + return CONV_ENTRY; + case TargetOpcode::CONVERGENCECTRL_ANCHOR: + return CONV_ANCHOR; + case TargetOpcode::CONVERGENCECTRL_LOOP: + return CONV_LOOP; + } +} + +template <> +const MachineInstr * +GenericConvergenceVerifier::findAndCheckConvergenceTokenUsed( + const MachineInstr &MI) { + const MachineRegisterInfo &MRI = Context.getFunction()->getRegInfo(); + const MachineInstr *TokenDef = nullptr; + + for (const MachineOperand &MO : MI.uses()) { + if (!MO.isReg()) + continue; + Register OpReg = MO.getReg(); + if (!OpReg.isVirtual()) + continue; + + const MachineInstr *Def = MRI.getVRegDef(OpReg); + if (!Def) + continue; + if (getConvOp(*Def) == CONV_NONE) + continue; + + CheckOrNull( + MI.isConvergent(), + "Convergence control tokens can only be used by convergent operations.", + {Context.print(OpReg), Context.print(&MI)}); + + CheckOrNull(!TokenDef, + "An operation can use at most one convergence control token.", + {Context.print(OpReg), Context.print(&MI)}); + + TokenDef = Def; + } + + if (TokenDef) + Tokens[&MI] = TokenDef; + + return TokenDef; +} + +template <> +bool GenericConvergenceVerifier::isInsideConvergentFunction( + const MachineInstr &MI) { + // The class MachineFunction does not have any property to 
indicate whether it + // is convergent. Trivially return true so that the check always passes. + return true; +} + +template <> +bool GenericConvergenceVerifier::isConvergent( + const MachineInstr &MI) { + return MI.isConvergent(); +} + +template class llvm::GenericConvergenceVerifier; diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 2632b5b9feac9..d1635cbd5bc85 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -39,6 +39,8 @@ #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConvergenceVerifier.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -220,6 +222,11 @@ namespace { LiveStacks *LiveStks = nullptr; SlotIndexes *Indexes = nullptr; + // This is calculated only when trying to verify convergence control tokens. + // Similar to the LLVM IR verifier, we calculate this locally instead of + // relying on the pass manager. 
+ MachineDomTree DT; + void visitMachineFunctionBefore(); void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); void visitMachineBundleBefore(const MachineInstr *MI); @@ -2955,7 +2962,34 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) { } } +static void +verifyConvergenceControl(const MachineFunction &MF, MachineDomTree &DT, + std::function FailureCB) { + using MFP = MachineFunctionProperties::Property; + const MachineFunctionProperties &Properties = MF.getProperties(); + bool IsSSA = Properties.hasProperty(MFP::IsSSA); + + MachineConvergenceVerifier CV; + CV.initialize(&errs(), FailureCB, MF, IsSSA); + + for (const auto &MBB : MF) { + CV.visit(MBB); + for (const auto &MI : MBB.instrs()) + CV.visit(MI); + } + + if (CV.sawTokens()) { + DT.recalculate(const_cast(MF)); + CV.verify(DT); + } +} + void MachineVerifier::visitMachineFunctionAfter() { + auto FailureCB = [this](const Twine &Message) { + report(Message.str().c_str(), MF); + }; + verifyConvergenceControl(*MF, DT, FailureCB); + calcRegsPassed(); for (const MachineBasicBlock &MBB : *MF) diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 4172fbc96d1e5..e01cd8cbf925a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -395,6 +395,16 @@ Register FunctionLoweringInfo::CreateRegs(const Value *V) { !TLI->requiresUniformRegister(*MF, V)); } +Register FunctionLoweringInfo::InitializeRegForValue(const Value *V) { + // Tokens live in vregs only when used for convergence control. 
+ if (V->getType()->isTokenTy() && !isa(V)) + return 0; + Register &R = ValueMap[V]; + assert(R == Register() && "Already initialized this value register!"); + assert(VirtReg2Value.empty()); + return R = CreateRegs(V); +} + /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the /// register is a PHI destination and the PHI's LiveOutInfo is not valid. If /// the register's LiveOutInfo is for a smaller bit width, it is extended to diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 032cff416cda9..54409cbf91f1f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -285,6 +285,30 @@ Register InstrEmitter::getVR(SDValue Op, return I->second; } +static bool isConvergenceCtrlMachineOp(SDValue Op) { + if (Op->isMachineOpcode()) { + switch (Op->getMachineOpcode()) { + case TargetOpcode::CONVERGENCECTRL_ANCHOR: + case TargetOpcode::CONVERGENCECTRL_ENTRY: + case TargetOpcode::CONVERGENCECTRL_LOOP: + case TargetOpcode::CONVERGENCECTRL_GLUE: + return true; + } + return false; + } + + // We can reach here when CopyFromReg is encountered. But rather than making a + // special case for that, we just make sure we don't reach here in some + // surprising way. + switch (Op->getOpcode()) { + case ISD::CONVERGENCECTRL_ANCHOR: + case ISD::CONVERGENCECTRL_ENTRY: + case ISD::CONVERGENCECTRL_LOOP: + case ISD::CONVERGENCECTRL_GLUE: + llvm_unreachable("Convergence control should have been selected by now."); + } + return false; +} /// AddRegisterOperand - Add the specified register as an operand to the /// specified machine instr. Insert register copies if the register is @@ -346,9 +370,12 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, // multiple uses. // Tied operands are never killed, so we need to check that. And that // means we need to determine the index of the operand. 
- bool isKill = Op.hasOneUse() && - Op.getNode()->getOpcode() != ISD::CopyFromReg && - !IsDebug && + // Don't kill convergence control tokens. Initially they are only used in glue + // nodes, and the InstrEmitter later adds implicit uses on the users of the + // glue node. This can sometimes make it seem like there is only one use, + // which is the glue node itself. + bool isKill = Op.hasOneUse() && !isConvergenceCtrlMachineOp(Op) && + Op.getNode()->getOpcode() != ISD::CopyFromReg && !IsDebug && !(IsClone || IsCloned); if (isKill) { unsigned Idx = MIB->getNumOperands(); @@ -1191,6 +1218,17 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, } } + if (SDNode *GluedNode = Node->getGluedNode()) { + // FIXME: Possibly iterate over multiple glue nodes? + if (GluedNode->getOpcode() == + ~(unsigned)TargetOpcode::CONVERGENCECTRL_GLUE) { + Register VReg = getVR(GluedNode->getOperand(0), VRBaseMap); + MachineOperand MO = MachineOperand::CreateReg(VReg, /*isDef=*/false, + /*isImp=*/true); + MIB->addOperand(MO); + } + } + // Run post-isel target hook to adjust this instruction if needed. if (II.hasPostISelHook()) TLI->AdjustInstrPostInstrSelection(*MIB, Node); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2bdf48643edc3..97d8b48b4bd36 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5065,6 +5065,17 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Create the node. 
SDValue Result; + + if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) { + auto *Token = Bundle->Inputs[0].get(); + SDValue ConvControlToken = getValue(Token); + assert(Ops.back().getValueType() != MVT::Glue && + "Did not expected another glue node here."); + ConvControlToken = + DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); + Ops.push_back(ConvControlToken); + } + // In some cases, custom collection of operands from CallInst I may be needed. TLI.CollectTargetIntrinsicOperands(I, Ops, DAG); if (IsTgtIntrinsic) { @@ -6065,6 +6076,27 @@ bool SelectionDAGBuilder::visitEntryValueDbgValue( return true; } +/// Lower the call to the specified intrinsic function. +void SelectionDAGBuilder::visitConvergenceControl(const CallInst &I, + unsigned Intrinsic) { + SDLoc sdl = getCurSDLoc(); + switch (Intrinsic) { + case Intrinsic::experimental_convergence_anchor: + setValue(&I, DAG.getNode(ISD::CONVERGENCECTRL_ANCHOR, sdl, MVT::Untyped)); + break; + case Intrinsic::experimental_convergence_entry: + setValue(&I, DAG.getNode(ISD::CONVERGENCECTRL_ENTRY, sdl, MVT::Untyped)); + break; + case Intrinsic::experimental_convergence_loop: { + auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl); + auto *Token = Bundle->Inputs[0].get(); + setValue(&I, DAG.getNode(ISD::CONVERGENCECTRL_LOOP, sdl, MVT::Untyped, + getValue(Token))); + break; + } + } +} + /// Lower the call to the specified intrinsic function. 
void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { @@ -7724,6 +7756,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::experimental_vector_deinterleave2: visitVectorDeinterleave(I); return; + case Intrinsic::experimental_convergence_anchor: + case Intrinsic::experimental_convergence_entry: + case Intrinsic::experimental_convergence_loop: + visitConvergenceControl(I, Intrinsic); } } @@ -8398,6 +8434,14 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, } } + SDValue ConvControlToken; + if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) { + auto *Token = Bundle->Inputs[0].get(); + ConvControlToken = getValue(Token); + } else { + ConvControlToken = DAG.getUNDEF(MVT::Untyped); + } + TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(getCurSDLoc()) .setChain(getRoot()) @@ -8406,7 +8450,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, .setConvergent(CB.isConvergent()) .setIsPreallocated( CB.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0) - .setCFIType(CFIType); + .setCFIType(CFIType) + .setConvergenceControlToken(ConvControlToken); std::pair Result = lowerInvokable(CLI, EHPadBB); if (Result.first.getNode()) { @@ -8958,7 +9003,8 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { assert(!I.hasOperandBundlesOtherThan( {LLVMContext::OB_deopt, LLVMContext::OB_funclet, LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated, - LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_kcfi}) && + LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_kcfi, + LLVMContext::OB_convergencectrl}) && "Cannot lower calls with arbitrary operand bundles!"); SDValue Callee = getValue(I.getCalledOperand()); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 47657313cb6a3..9b735672eedfb 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -618,6 +618,7 @@ class SelectionDAGBuilder { void visitIntrinsicCall(const CallInst &I, unsigned Intrinsic); void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); + void visitConvergenceControl(const CallInst &I, unsigned Intrinsic); void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT, const SmallVectorImpl &OpValues); void visitVPStore(const VPIntrinsic &VPIntrin, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 0fbd999694f10..5b8772f413a62 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -165,6 +165,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { if (cast(this)->isOpaque()) return "OpaqueTargetConstant"; return "TargetConstant"; + + // clang-format off + case ISD::TargetConstantFP: return "TargetConstantFP"; case ISD::TargetGlobalAddress: return "TargetGlobalAddress"; case ISD::TargetGlobalTLSAddress: return "TargetGlobalTLSAddress"; @@ -447,6 +450,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SET_FPMODE: return "set_fpmode"; case ISD::RESET_FPMODE: return "reset_fpmode"; + // Convergence control instructions + case ISD::CONVERGENCECTRL_ANCHOR: return "convergencectrl_anchor"; + case ISD::CONVERGENCECTRL_ENTRY: return "convergencectrl_entry"; + case ISD::CONVERGENCECTRL_LOOP: return "convergencectrl_loop"; + // Bit manipulation case ISD::ABS: return "abs"; case ISD::BITREVERSE: return "bitreverse"; @@ -462,6 +470,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::INIT_TRAMPOLINE: return "init_trampoline"; case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; + // clang-format on + case ISD::CONDCODE: switch 
(cast(this)->get()) { default: llvm_unreachable("Unknown setcc condition!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 9b5ab4267b80e..1c14e4da8e9d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2370,6 +2370,21 @@ void SelectionDAGISel::Select_MEMBARRIER(SDNode *N) { N->getOperand(0)); } +void SelectionDAGISel::Select_CONVERGENCECTRL_ANCHOR(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::CONVERGENCECTRL_ANCHOR, + N->getValueType(0)); +} + +void SelectionDAGISel::Select_CONVERGENCECTRL_ENTRY(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::CONVERGENCECTRL_ENTRY, + N->getValueType(0)); +} + +void SelectionDAGISel::Select_CONVERGENCECTRL_LOOP(SDNode *N) { + CurDAG->SelectNodeTo(N, TargetOpcode::CONVERGENCECTRL_LOOP, + N->getValueType(0), N->getOperand(0)); +} + void SelectionDAGISel::pushStackMapLiveVariable(SmallVectorImpl &Ops, SDValue OpVal, SDLoc DL) { SDNode *OpNode = OpVal.getNode(); @@ -3117,6 +3132,15 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::JUMP_TABLE_DEBUG_INFO: Select_JUMP_TABLE_DEBUG_INFO(NodeToMatch); return; + case ISD::CONVERGENCECTRL_ANCHOR: + Select_CONVERGENCECTRL_ANCHOR(NodeToMatch); + return; + case ISD::CONVERGENCECTRL_ENTRY: + Select_CONVERGENCECTRL_ENTRY(NodeToMatch); + return; + case ISD::CONVERGENCECTRL_LOOP: + Select_CONVERGENCECTRL_LOOP(NodeToMatch); + return; } assert(!NodeToMatch->isMachineOpcode() && "Node already selected!"); diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 731fcabaee402..fe4f1fb658ad5 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -627,6 +627,8 @@ EVT EVT::getEVT(Type *Ty, bool HandleUnknown){ switch (Ty->getTypeID()) { default: return MVT::getVT(Ty, HandleUnknown); + case Type::TokenTyID: + return MVT::Untyped; case Type::IntegerTyID: return 
getIntegerVT(Ty->getContext(), cast(Ty)->getBitWidth()); case Type::FixedVectorTyID: diff --git a/llvm/lib/IR/ConvergenceVerifier.cpp b/llvm/lib/IR/ConvergenceVerifier.cpp index 336c202b6f94c..41361fb9c3066 100644 --- a/llvm/lib/IR/ConvergenceVerifier.cpp +++ b/llvm/lib/IR/ConvergenceVerifier.cpp @@ -14,6 +14,24 @@ using namespace llvm; +template <> +auto GenericConvergenceVerifier::getConvOp(const Instruction &I) + -> ConvOpKind { + const auto *CB = dyn_cast(&I); + if (!CB) + return CONV_NONE; + switch (CB->getIntrinsicID()) { + default: + return CONV_NONE; + case Intrinsic::experimental_convergence_anchor: + return CONV_ANCHOR; + case Intrinsic::experimental_convergence_entry: + return CONV_ENTRY; + case Intrinsic::experimental_convergence_loop: + return CONV_LOOP; + } +} + template <> const Instruction * GenericConvergenceVerifier::findAndCheckConvergenceTokenUsed( @@ -38,11 +56,10 @@ GenericConvergenceVerifier::findAndCheckConvergenceTokenUsed( auto *Token = Bundle->Inputs[0].get(); auto *Def = dyn_cast(Token); - CheckOrNull( - Def && isConvergenceControlIntrinsic(SSAContext::getIntrinsicID(*Def)), - "Convergence control tokens can only be produced by calls to the " - "convergence control intrinsics.", - {Context.print(Token), Context.print(&I)}); + CheckOrNull(Def && getConvOp(*Def) != CONV_NONE, + "Convergence control tokens can only be produced by calls to the " + "convergence control intrinsics.", + {Context.print(Token), Context.print(&I)}); if (Def) Tokens[&I] = Def; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index b04d39c700a8f..f74a621360f88 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -412,7 +412,7 @@ class Verifier : public InstVisitor, VerifierSupport { auto FailureCB = [this](const Twine &Message) { this->CheckFailed(Message); }; - ConvergenceVerifyHelper.initialize(OS, FailureCB, F); + ConvergenceVerifyHelper.initialize(OS, FailureCB, F, /*isSSA=*/true); Broken = false; // FIXME: We strip const here 
because the inst visitor strips const. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 024adcda0fa06..caba500053652 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2687,7 +2687,18 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { unsigned IntrID = N->getConstantOperandVal(0); - unsigned Opcode; + unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END; + SDNode *ConvGlueNode = N->getGluedNode(); + if (ConvGlueNode) { + // FIXME: Possibly iterate over multiple glue nodes? + assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE); + ConvGlueNode = ConvGlueNode->getOperand(0).getNode(); + ConvGlueNode = + CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {}, + MVT::Glue, SDValue(ConvGlueNode, 0)); + } else { + ConvGlueNode = nullptr; + } switch (IntrID) { case Intrinsic::amdgcn_wqm: Opcode = AMDGPU::WQM; @@ -2719,11 +2730,19 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { break; default: SelectCode(N); - return; + break; } - SDValue Src = N->getOperand(1); - CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); + if (Opcode != AMDGPU::INSTRUCTION_LIST_END) { + SDValue Src = N->getOperand(1); + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); + } + + if (ConvGlueNode) { + SmallVector NewOps(N->op_begin(), N->op_end()); + NewOps.push_back(SDValue(ConvGlueNode, 0)); + CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps); + } } void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e26b4cf820a52..d61d0a8014073 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -245,6 +245,13 @@ static cl::opt LateCFGStructurize( 
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); +// Disable structurizer-based control-flow lowering in order to test convergence +// control tokens. This should eventually be replaced by the wave-transform. +static cl::opt DisableStructurizer( + "amdgpu-disable-structurizer", + cl::desc("Disable structurizer for experiments; produces unusable code"), + cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden); + // Enable lib calls simplifications static cl::opt EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -591,6 +598,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; +bool AMDGPUTargetMachine::DisableStructurizer = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -1185,7 +1193,7 @@ bool GCNPassConfig::addPreISel() { // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); - if (!LateCFGStructurize) { + if (!LateCFGStructurize && !DisableStructurizer) { if (EnableStructurizerWorkarounds) { addPass(createFixIrreduciblePass()); addPass(createUnifyLoopExitsPass()); @@ -1193,7 +1201,7 @@ bool GCNPassConfig::addPreISel() { addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions } addPass(createAMDGPUAnnotateUniformValues()); - if (!LateCFGStructurize) { + if (!LateCFGStructurize && !DisableStructurizer) { addPass(createSIAnnotateControlFlowPass()); // TODO: Move this right after structurizeCFG to avoid extra divergence // analysis. 
This depends on stopping SIAnnotateControlFlow from making diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index ce2dd2947daf6..30ab388c7d52e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -37,6 +37,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableLowerModuleLDS; + static bool DisableStructurizer; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5e1d750850374..126c1bd3e991f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -98,6 +98,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f64, V64RegClass); addRegisterClass(MVT::v2f32, V64RegClass); + addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); @@ -3812,6 +3813,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); } + if (!IsTailCall) + Ops.push_back(CLI.ConvergenceControlToken); + if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual @@ -5139,8 +5143,26 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineInstrBuilder MIB; MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); - for (const MachineOperand &MO : MI.operands()) - MIB.add(MO); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + MachineOperand &MO = MI.getOperand(I); + if (I != 2) { + MIB.add(MO); + continue; + } + } + + MachineOperand &MO = 
MI.getOperand(2); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + // The token operand is always a register, whose definition is IMPLICIT_DEF + // iff there was no token on the call. + if (MachineInstr *Def = MRI.getVRegDef(MO.getReg())) { + if (Def->getOpcode() != TargetOpcode::IMPLICIT_DEF) { + Def->dump(); + MO.dump(); + MO.setImplicit(); + MIB.add(MO); + } + } MIB.cloneMemRefs(MI); MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 565af36bc523e..33c93cdf20c43 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -618,8 +618,8 @@ def SI_RETURN : SPseudoInstSI < // This version is only needed so we can fill in the output register // in the custom inserter. def SI_CALL_ISEL : SPseudoInstSI < - (outs), (ins SSrc_b64:$src0, unknown:$callee), - [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> { + (outs), (ins SSrc_b64:$src0, unknown:$callee, unknown:$token), + [(AMDGPUcall i64:$src0, tglobaladdr:$callee, untyped:$token)]> { let Size = 4; let isCall = 1; let SchedRW = [WriteBranch]; @@ -629,8 +629,8 @@ def SI_CALL_ISEL : SPseudoInstSI < } def : GCNPat< - (AMDGPUcall i64:$src0, (i64 0)), - (SI_CALL_ISEL $src0, (i64 0)) + (AMDGPUcall i64:$src0, (i64 0), untyped:$token), + (SI_CALL_ISEL $src0, (i64 0), untyped:$token) >; // Wrapper around s_swappc_b64 with extra $callee parameter to track diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll new file mode 100644 index 0000000000000..2ed6d7fd0f598 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll @@ -0,0 +1,83 @@ +; RUN: llc --amdgpu-disable-structurizer -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s +; RUN: llc --amdgpu-disable-structurizer -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | 
FileCheck --check-prefixes=CHECK,DEADMI %s + +; CHECK-LABEL: name: basic_call +; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY +; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, [[TOKEN]], csr_amdgpu, {{.*}} +; DEADMI: {{.*}} SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] +define i32 @basic_call(i32 %src) #0 { + %t = call token @llvm.experimental.convergence.entry() + %r = call i32 @foo(i32 %src) [ "convergencectrl"(token %t) ] + ret i32 %r +} + +; CHECK-LABEL: name: basic_intrinsic +; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR +; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]] +; DEADMI-NOT: CONVERGENCECTRL_GLUE +; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +define i32 @basic_intrinsic(i32 %src) #0 { + %t = call token @llvm.experimental.convergence.anchor() + %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ] + ret i32 %r +} + +; There's nothing to check here. The test is just meant to catch any crashes +; when a convergent call has no token. 
+define i32 @uncontrolled_call(i32 %src) #0 { + %r = call i32 @foo(i32 %src) + ret i32 %r +} + +; CHECK-LABEL: name: basic_branch +; CHECK: bb.0.entry: +; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR +; CHECK: bb.1.then: +; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]] +; DEADMI-NOT: CONVERGENCECTRL_GLUE +; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +define i32 @basic_branch(i32 %src, i1 %cond) #0 { +entry: + %t = call token @llvm.experimental.convergence.anchor() + %x = add i32 %src, 1 + br i1 %cond, label %then, label %else + +then: + %r = call i32 @llvm.amdgcn.readfirstlane(i32 %x) [ "convergencectrl"(token %t) ] + br label %else + +else: + %p = phi i32 [%r, %then], [%x, %entry] + ret i32 %p +} + +; CHECK-LABEL: name: basic_loop +; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR +; CHECK: bb.1.loop: +; CHECK: [[LOOP:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_LOOP [[TOKEN]] +; ISEL: CONVERGENCECTRL_GLUE [[LOOP]] +; DEADMI-NOT: CONVERGENCECTRL_GLUE +; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]] +define i32 @basic_loop(i32 %src, i1 %cond) #0 { + %t1 = call token @llvm.experimental.convergence.anchor() + br label %loop + +loop: + %t2 = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %t1) ] + %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t2) ] + br i1 %cond, label %loop, label %end + +end: + ret i32 %r +} + +declare i32 @foo(i32 %x) #0 + +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 + +declare token @llvm.experimental.convergence.entry() +declare token @llvm.experimental.convergence.anchor() +declare token @llvm.experimental.convergence.loop() + +attributes #0 = { nounwind readnone convergent } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll index ab160ffc10ed0..e015095a4884a 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll +++ 
b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll @@ -92,6 +92,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b ; DAGISEL-GFX11-NEXT: $vgpr5 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr6 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr7 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -121,6 +122,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b ; DAGISEL-GFX10-NEXT: $vgpr5 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr6 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr7 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -232,6 +234,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit 
$vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -269,6 +272,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -400,6 +404,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr12 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -449,6 +454,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr12 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: 
$vgpr13 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -500,6 +506,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b) ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -517,6 +524,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b) ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -568,6 +576,7 @@ define 
amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -585,6 +594,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -636,6 +646,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat % ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, 
implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -653,6 +664,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat % ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -704,6 +716,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) { ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -721,6 +734,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) { ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; 
DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -856,6 +870,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16 ; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr14 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr15 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -901,6 +916,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16 ; DAGISEL-GFX10-NEXT: $vgpr13 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr14 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr15 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -2464,6 +2480,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128 ; DAGISEL-GFX11-NEXT: $vgpr29 = COPY [[COPY134]] ; DAGISEL-GFX11-NEXT: $vgpr30 = COPY [[COPY133]] ; DAGISEL-GFX11-NEXT: $vgpr31 = COPY [[COPY132]] + ; DAGISEL-GFX11-NEXT: 
[[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -2810,6 +2827,7 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128 ; DAGISEL-GFX10-NEXT: $vgpr29 = COPY [[COPY134]] ; DAGISEL-GFX10-NEXT: $vgpr30 = COPY [[COPY133]] ; DAGISEL-GFX10-NEXT: $vgpr31 = COPY [[COPY132]] + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: 
S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 6e905542ce53c..8b6b48bcdba0d 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -60,6 +60,7 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index f70441e87a74b..5f507d482eeb6 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,7 +27,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s24, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -43,6 +43,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 @@ -54,7 +55,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, 
s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_mov_b32 s33, s24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -87,6 +88,7 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm bb: @@ -146,6 +148,7 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm bb: @@ -170,7 +173,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s24, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill @@ -185,6 +188,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 @@ -192,7 +196,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_mov_b32 s33, s24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -204,7 +208,7 @@ define hidden i32 
@caller_save_vgpr_spill_fp() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s19, s33 +; CHECK-NEXT: s_mov_b32 s25, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -219,6 +223,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 @@ -226,7 +231,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s19 +; CHECK-NEXT: s_mov_b32 s33, s25 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -258,6 +263,7 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 9999cb9173b5d..34e67d0993fb7 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -32,6 +32,7 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index f523b4a2495f1..764f4942cbd03 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,7 +16,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill @@ -150,6 +150,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] +; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v255, 1 @@ -269,7 +270,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -310,7 +311,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -443,6 +444,7 @@ 
define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] +; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v254, 1 @@ -561,7 +563,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -1528,7 +1530,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -1666,6 +1668,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] +; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_mov_b64 exec, 1 @@ -1798,7 +1801,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll 
b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 8c5b89429bcc1..33b5d6c6850bf 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -916,13 +916,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_mov_b32 s32, 0x1200 -; WAVE32-O0-NEXT: s_getpc_b64 s[20:21] -; WAVE32-O0-NEXT: s_mov_b32 s20, s0 -; WAVE32-O0-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 +; WAVE32-O0-NEXT: s_getpc_b64 s[24:25] +; WAVE32-O0-NEXT: s_mov_b32 s24, s0 +; WAVE32-O0-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 ; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-O0-NEXT: s_bitset0_b32 s23, 21 -; WAVE32-O0-NEXT: s_add_u32 s20, s20, s9 -; WAVE32-O0-NEXT: s_addc_u32 s21, s21, 0 +; WAVE32-O0-NEXT: s_bitset0_b32 s27, 21 +; WAVE32-O0-NEXT: s_add_u32 s24, s24, s9 +; WAVE32-O0-NEXT: s_addc_u32 s25, s25, 0 ; WAVE32-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: s_mov_b32 s14, s8 ; WAVE32-O0-NEXT: s_mov_b32 s13, s7 @@ -934,17 +934,17 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21] -; WAVE32-O0-NEXT: 
s_mov_b64 s[2:3], s[22:23] +; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[24:25] +; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[26:27] ; WAVE32-O0-NEXT: s_mov_b32 s6, s32 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 ; WAVE32-O0-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi ; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1018,10 +1018,11 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 +; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1 +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s0, v0, 0 @@ -1136,6 +1137,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 +; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload @@ -1153,13 +1155,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; 
WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, 0x1200 -; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[20:21] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s20, s0 -; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 +; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[24:25] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s0 +; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s23, 21 -; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s20, s20, s9 -; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s21, s21, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s27, 21 +; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s24, s24, s9 +; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s25, s25, 0 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s8 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s13, s7 @@ -1172,13 +1174,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], 0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[24:25] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[26:27] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, s32 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, 
stack_passed_argument@abs32@lo ; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1252,6 +1254,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s1, v32, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s0, v32, 0 @@ -1344,7 +1347,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-O0-NEXT: s_mov_b32 s25, s33 +; WAVE32-O0-NEXT: s_mov_b32 s26, s33 ; WAVE32-O0-NEXT: s_mov_b32 s33, s32 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1358,9 +1361,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s16, s16, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25 ; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1437,10 +1440,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 +; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; 
WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s5, v0, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 @@ -1456,14 +1460,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-O0-NEXT: s_mov_b32 s33, s25 +; WAVE32-O0-NEXT: s_mov_b32 s33, s26 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-O0: ; %bb.0: ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-O0-NEXT: s_mov_b32 s19, s33 +; WAVE64-O0-NEXT: s_mov_b32 s28, s33 ; WAVE64-O0-NEXT: s_mov_b32 s33, s32 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; WAVE64-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1556,6 +1560,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 +; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload @@ -1575,14 +1580,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] ; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00 
-; WAVE64-O0-NEXT: s_mov_b32 s33, s19 +; WAVE64-O0-NEXT: s_mov_b32 s33, s28 ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] ; ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s33 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s25, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1672,6 +1677,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v32, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v32, 0 @@ -1687,7 +1693,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s25 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index bfc249e9081d2..d2364a61ed686 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -233,10 +233,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr 
%extern_fun ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -249,8 +249,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -286,8 +286,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.7: ; 
SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -356,9 +356,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -371,7 +371,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef 
%56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -407,7 +407,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 7840559c78eb6..364ce82b2e997 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -47,6 +47,7 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_mov_b32 s15, 42 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 7eabe982ff2bc..3a33194f17c87 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ 
b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -101,6 +101,7 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] +; GCN-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e79cb66dcd776..11f6a2960776b 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -406,6 +406,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 @@ -632,6 +633,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 47c976d2a5c33..6ac61410a0e7d 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -413,6 +413,7 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload @@ -656,6 +657,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1283,6 +1285,7 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1526,6 +1529,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll index 58b3ee485ea4b..4e72a5ac5ede3 100644 --- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll +++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll @@ -577,15 +577,15 @@ define double @fcmp_nnan(double %a, double %y, double %z) { ; FP library calls can have fast-math-flags. 
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:' -; FMFDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64 -; FMFDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1 -; FMFDEBUG: f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1 +; FMFDEBUG: ch,glue = PPCISD::CALL_NOP {{t[0-9]+}}, TargetGlobalAddress:i64 +; FMFDEBUG: ch,glue = callseq_end [[T15:t[0-9]+]], TargetConstant:i64<32>, TargetConstant:i64<0>, [[T15]]:1 +; FMFDEBUG: f64,ch,glue = CopyFromReg [[T16:t[0-9]+]], Register:f64 $f1, [[T16]]:1 ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:' -; GLOBALDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64 -; GLOBALDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1 -; GLOBALDEBUG: f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1 +; GLOBALDEBUG: ch,glue = PPCISD::CALL_NOP {{t[0-9]+}}, TargetGlobalAddress:i64 +; GLOBALDEBUG: ch,glue = callseq_end [[T15:t[0-9]+]], TargetConstant:i64<32>, TargetConstant:i64<0>, [[T15]]:1 +; GLOBALDEBUG: f64,ch,glue = CopyFromReg [[T16:t[0-9]+]], Register:f64 $f1, [[T16]]:1 ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:' declare double @log2(double) diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/basic.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/basic.mir new file mode 100644 index 0000000000000..94d0ddad25944 --- /dev/null +++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/basic.mir @@ -0,0 +1,37 @@ +# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +--- +name: basic +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2; + %0:sgpr_64 = CONVERGENCECTRL_ANCHOR + ; CHECK: Entry intrinsic cannot be preceded by a convergent operation in the same basic block. 
+ ; CHECK: CONVERGENCECTRL_ENTRY + %1:sgpr_64 = CONVERGENCECTRL_ENTRY + ; CHECK: Loop intrinsic cannot be preceded by a convergent operation in the same basic block. + ; CHECK: CONVERGENCECTRL_LOOP + %2:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 + S_CBRANCH_EXECZ %bb.1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2; + ; CHECK: Entry intrinsic can occur only in the entry block. + ; CHECK: CONVERGENCECTRL_ENTRY + %5:sgpr_64 = CONVERGENCECTRL_ENTRY + + bb.2: + ; CHECK: Convergence control tokens can only be used by convergent operations. + ; CHECK: G_PHI + %6:sgpr_64 = G_PHI %0:sgpr_64, %bb.0, %0:sgpr_64, %bb.1 + %7:sgpr_64 = CONVERGENCECTRL_ANCHOR + %8:sgpr_64 = IMPLICIT_DEF + %4:sgpr_64 = SI_CALL %8:sgpr_64, 1, implicit %7:sgpr_64 + ; CHECK: An operation can use at most one convergence control token. + ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 2 + %9:sgpr_64 = SI_CALL %8:sgpr_64, 2, implicit %7:sgpr_64, implicit %7:sgpr_64 + ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function. + ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 3 + %10:sgpr_64 = SI_CALL %8:sgpr_64, 3 +... diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/cycles.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/cycles.mir new file mode 100644 index 0000000000000..87cf3e604929b --- /dev/null +++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/cycles.mir @@ -0,0 +1,52 @@ +# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +--- +name: cycles +body: | + bb.0: + %0:sgpr_64 = CONVERGENCECTRL_ANCHOR + %1:sgpr_64 = IMPLICIT_DEF + S_CBRANCH_EXECZ %bb.9, implicit $exec + S_BRANCH %bb.1 + + bb.1: + S_CBRANCH_EXECZ %bb.8, implicit $exec + S_BRANCH %bb.5 + + bb.2: + S_CBRANCH_EXECZ %bb.3, implicit $exec + S_BRANCH %bb.4 + + bb.3: + ; CHECK: Cycle heart must dominate all blocks in the cycle. 
+ ; Irreducible cycle: entries(bb.4 bb.3) + %3:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 + S_BRANCH %bb.4 + + bb.4: + S_BRANCH %bb.3 + + bb.5: + S_CBRANCH_EXECZ %bb.6, implicit $exec + S_BRANCH %bb.2 + + bb.6: + S_BRANCH %bb.7 + + bb.7: + ; CHECK: Cycle heart must dominate all blocks in the cycle. + ; Reducible cycle: entries(bb.6) bb.7 + %4:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 + S_BRANCH %bb.6 + + bb.8: + ; CHECK: Two static convergence token uses in a cycle that does not contain either token's definition. + %5:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 + %6:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 + S_BRANCH %bb.8 + + bb.9: + ; CHECK: Convergence token used by an instruction other than llvm.experimental.convergence.loop in a cycle that does not contain the token's definition. + %7:sgpr_64 = G_SI_CALL %1:sgpr_64, 3, implicit %0:sgpr_64 + S_BRANCH %bb.9 + +... diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/lit.local.cfg b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/lit.local.cfg new file mode 100644 index 0000000000000..7c492428aec76 --- /dev/null +++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AMDGPU" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/mixed2.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/mixed2.mir new file mode 100644 index 0000000000000..c70a48bf21309 --- /dev/null +++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/mixed2.mir @@ -0,0 +1,15 @@ +# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +--- +name: mixed2 +body: | + bb.0: + %0:sgpr_64 = IMPLICIT_DEF + %1:sgpr_64 = SI_CALL %0, 1 + ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function. + ; CHECK: CONVERGENCECTRL_ANCHOR + %2:sgpr_64 = CONVERGENCECTRL_ANCHOR + ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function. 
+ ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 2 + %3:sgpr_64 = SI_CALL %0, 2, implicit %2:sgpr_64 + +... diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/not-ssa.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/not-ssa.mir new file mode 100644 index 0000000000000..b3834f4f4c571 --- /dev/null +++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/not-ssa.mir @@ -0,0 +1,11 @@ +# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +--- +name: not_ssa +tracksRegLiveness: true +body: | + bb.0: + ; CHECK: Convergence control requires SSA. + %0:sgpr_64 = CONVERGENCECTRL_ANCHOR + %8:sgpr_64 = IMPLICIT_DEF + %8:sgpr_64 = IMPLICIT_DEF +... diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/region-nesting.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/region-nesting.mir new file mode 100644 index 0000000000000..9e869acb3e938 --- /dev/null +++ b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/region-nesting.mir @@ -0,0 +1,24 @@ +# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +--- +name: region_nesting +body: | + bb.0: + %0:sgpr_64 = CONVERGENCECTRL_ANCHOR + %1:sgpr_64 = CONVERGENCECTRL_ANCHOR + %2:sgpr_64 = IMPLICIT_DEF + %3:sgpr_64 = SI_CALL %2, 1, implicit %0:sgpr_64 + ; CHECK: Convergence region is not well-nested. + ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 2 + %4:sgpr_64 = SI_CALL %2, 2, implicit %1:sgpr_64 + S_CBRANCH_EXECZ %bb.1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + %5:sgpr_64 = SI_CALL %2, 3, implicit %0:sgpr_64 + + bb.2: + ; CHECK: Convergence region is not well-nested. + ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 4 + %6:sgpr_64 = SI_CALL %2, 4, implicit %1:sgpr_64 + +... 
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td index 622d1df7b381a..40a831d7e9e8f 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td @@ -28,7 +28,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(65), GIMT_Encode2(182), /*)*//*default:*//*Label 2*/ GIMT_Encode4(562), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(69), GIMT_Encode2(186), /*)*//*default:*//*Label 2*/ GIMT_Encode4(562), // CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(478), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ GIMT_Encode4(530), // CHECK-NEXT: // Label 0: @478 diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td index f0ca65a87b76b..751b1318ecc01 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td @@ -34,12 +34,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(128), /*)*//*default:*//*Label 3*/ GIMT_Encode4(563), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(446), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4(477), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(523), -// CHECK-NEXT: // Label 0: @446 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(476), // Rule ID 0 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(132), /*)*//*default:*//*Label 3*/ GIMT_Encode4(579), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(462), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4(493), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(539), 
+// CHECK-NEXT: // Label 0: @462 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(492), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a @@ -51,10 +51,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddImm8, /*InsnID*/0, /*Imm*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @476 +// CHECK-NEXT: // Label 4: @492 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @477 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(522), // Rule ID 2 // +// CHECK-NEXT: // Label 1: @493 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(538), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a @@ -66,10 +66,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddCImm, /*InsnID*/0, /*Type*/GILLT_s32, /*Imm*/GIMT_Encode8(42), // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @522 +// CHECK-NEXT: // Label 5: @538 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @523 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(562), // Rule ID 1 // +// CHECK-NEXT: // Label 2: @539 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(578), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -83,10 +83,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 6: @562 +// CHECK-NEXT: // Label 6: 
@578 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @563 +// CHECK-NEXT: // Label 3: @579 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 564 bytes +// CHECK-NEXT: }; // Size: 580 bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td index a446fb72298c2..e8e6d3e74f402 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td @@ -29,7 +29,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(115), GIMT_Encode2(117), /*)*//*default:*//*Label 2*/ GIMT_Encode4(132), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(119), GIMT_Encode2(121), /*)*//*default:*//*Label 2*/ GIMT_Encode4(132), // CHECK-NEXT: /*TargetOpcode::G_INTRINSIC*//*Label 0*/ GIMT_Encode4(18), // CHECK-NEXT: /*TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS*//*Label 1*/ GIMT_Encode4(73), // CHECK-NEXT: // Label 0: @18 diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td index d3c202c4cb01d..26a0ec6235e30 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td @@ -28,7 +28,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(120), GIMT_Encode2(183), /*)*//*default:*//*Label 3*/ GIMT_Encode4(380), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, 
/*[*/GIMT_Encode2(124), GIMT_Encode2(187), /*)*//*default:*//*Label 3*/ GIMT_Encode4(380), // CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 0*/ GIMT_Encode4(262), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 1*/ GIMT_Encode4(298), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_FPEXT*//*Label 2*/ GIMT_Encode4(344), diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td index cc77bfdd29c38..83b77519bc73a 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td @@ -37,7 +37,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(65), GIMT_Encode2(69), /*)*//*default:*//*Label 2*/ GIMT_Encode4(88), +// 
CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(69), GIMT_Encode2(73), /*)*//*default:*//*Label 2*/ GIMT_Encode4(88), // CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(26), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_BUILD_VECTOR*//*Label 1*/ GIMT_Encode4(57), // CHECK-NEXT: // Label 0: @26 @@ -98,6 +98,6 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: // Label 2: @88 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; +// CHECK-NEXT: }; // Size: 89 bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index 57ad0009b5bd6..5cf4e044a0fb8 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -132,15 +132,15 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // Verify match table. 
// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(128), /*)*//*default:*//*Label 6*/ GIMT_Encode4(661), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(446), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(541), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(583), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(608), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(621), -// CHECK-NEXT: // Label 0: @446 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(475), // Rule ID 4 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(132), /*)*//*default:*//*Label 6*/ GIMT_Encode4(677), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(462), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(504), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(557), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(599), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(624), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(637), +// CHECK-NEXT: // Label 0: @462 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(491), // Rule ID 4 // // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HasAnswerToEverything), // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), // CHECK-NEXT: // MIs[0] a @@ -155,8 +155,8 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // Combiner Rule #3: InstTest1 // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 7: @475 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(487), // Rule ID 3 // +// CHECK-NEXT: // Label 7: @491 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(503), // Rule ID 3 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -165,10 +165,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // Combiner Rule #2: InstTest0 
// CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner1), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 8: @487 +// CHECK-NEXT: // Label 8: @503 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @488 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(540), // Rule ID 6 // +// CHECK-NEXT: // Label 1: @504 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(556), // Rule ID 6 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule5Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] dst @@ -186,10 +186,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // z // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 9: @540 +// CHECK-NEXT: // Label 9: @556 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @541 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(582), // Rule ID 5 // +// CHECK-NEXT: // Label 2: @557 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(598), // Rule ID 5 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled), // CHECK-NEXT: // MIs[0] tmp // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1] @@ -207,32 +207,32 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner2), // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 10: @582 +// CHECK-NEXT: // Label 10: @598 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @583 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(595), // Rule ID 0 // +// CHECK-NEXT: // Label 3: @599 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(611), // Rule ID 0 // // CHECK-NEXT: 
GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 11: @595 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(607), // Rule ID 1 // +// CHECK-NEXT: // Label 11: @611 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(623), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 12: @607 +// CHECK-NEXT: // Label 12: @623 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 4: @608 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(620), // Rule ID 2 // +// CHECK-NEXT: // Label 4: @624 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(636), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_SEXT' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 13: @620 +// CHECK-NEXT: // Label 13: @636 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 5: @621 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(660), // Rule ID 7 // +// CHECK-NEXT: // Label 5: @637 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(676), // Rule ID 7 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule6Enabled), // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -247,10 +247,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: 
GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 14: @660 +// CHECK-NEXT: // Label 14: @676 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 6: @661 +// CHECK-NEXT: // Label 6: @677 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 662 bytes +// CHECK-NEXT: }; // Size: 678 bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } From 823102ab1e357e84846f03f2d6df5265271061bc Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Feb 2024 04:36:57 +0000 Subject: [PATCH 046/351] [gn build] Port 79889734b940 --- llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index e78ef13869e64..59df787dbb712 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -124,6 +124,7 @@ static_library("CodeGen") { "MachineCSE.cpp", "MachineCheckDebugify.cpp", "MachineCombiner.cpp", + "MachineConvergenceVerifier.cpp", "MachineCopyPropagation.cpp", "MachineCycleAnalysis.cpp", "MachineDebugify.cpp", From 086280f4d1c085c8e02cd3986bf87529ec7162c5 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 21 Feb 2024 13:07:34 +0800 Subject: [PATCH 047/351] [AMDGPU] Fix linking error of SIISelLowering.cpp.o (NFC) ld.lld: error: undefined symbol: llvm::MachineOperand::dump() const >>> referenced by SIISelLowering.cpp --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 126c1bd3e991f..4697751a4874f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5157,8 +5157,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( // iff there was no token on the 
call. if (MachineInstr *Def = MRI.getVRegDef(MO.getReg())) { if (Def->getOpcode() != TargetOpcode::IMPLICIT_DEF) { - Def->dump(); - MO.dump(); + LLVM_DEBUG({ + Def->dump(); + MO.dump(); + }); MO.setImplicit(); MIB.add(MO); } From e4057aacc52bf8b352898504be8e7f8190841aac Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 20 Feb 2024 21:11:01 -0800 Subject: [PATCH 048/351] [X86] Add missing pass initialization calls. (#82447) If the passes aren't registered, they don't show up in print-after-all. --- llvm/lib/Target/X86/X86TargetMachine.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9e4cf1ea99682..279a1efdff978 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -102,6 +102,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86ReturnThunksPass(PR); initializeX86DAGToDAGISelPass(PR); initializeX86ArgumentStackSlotPassPass(PR); + initializeX86FixupInstTuningPassPass(PR); + initializeX86FixupVectorConstantsPassPass(PR); } static std::unique_ptr createTLOF(const Triple &TT) { From b8ed69ecc01385c03844e8fa05ba418a5670d322 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Wed, 21 Feb 2024 13:08:01 +0800 Subject: [PATCH 049/351] [RISCV] Support llvm.readsteadycounter intrinsic This intrinsic was introduced by #81331, which is a lot like `llvm.readcyclecounter`. For the RISCV implementation, we rename `ReadCycleWide` pseudo to `ReadCounterWide` and make it accept two operands (the low and high parts of the counter). As for legalization and lowering parts, we reuse the code of `ISD::READCYCLECOUNTER` (make it able to handle both intrinsics), and we use `time` CSR for `ISD::READSTEADYCOUNTER`. Tests using Clang builtins are runned on real hardware and it works as excepted. 
Reviewers: asb, MaskRay, dtcxzyw, preames, topperc, jhuber6 Reviewed By: jhuber6, asb, MaskRay, dtcxzyw Pull Request: https://github.com/llvm/llvm-project/pull/82322 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 71 +++++++++++++------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 7 +- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 33 +++++---- llvm/test/CodeGen/RISCV/readsteadycounter.ll | 28 ++++++++ 4 files changed, 97 insertions(+), 42 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/readsteadycounter.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 874c851cd9147..87f7813c5d5d5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -625,10 +625,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.is64Bit()) setOperationAction(ISD::Constant, MVT::i64, Custom); - // TODO: On M-mode only targets, the cycle[h] CSR may not be present. + // TODO: On M-mode only targets, the cycle[h]/time[h] CSR may not be present. // Unfortunately this can't be determined just from the ISA naming string. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Subtarget.is64Bit() ? Legal : Custom); + setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, + Subtarget.is64Bit() ? 
Legal : Custom); setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Legal); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -11724,13 +11726,27 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Result); break; } - case ISD::READCYCLECOUNTER: { - assert(!Subtarget.is64Bit() && - "READCYCLECOUNTER only has custom type legalization on riscv32"); + case ISD::READCYCLECOUNTER: + case ISD::READSTEADYCOUNTER: { + assert(!Subtarget.is64Bit() && "READCYCLECOUNTER/READSTEADYCOUNTER only " + "has custom type legalization on riscv32"); + SDValue LoCounter, HiCounter; + MVT XLenVT = Subtarget.getXLenVT(); + if (N->getOpcode() == ISD::READCYCLECOUNTER) { + LoCounter = DAG.getConstant( + RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding, DL, XLenVT); + HiCounter = DAG.getConstant( + RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding, DL, XLenVT); + } else { + LoCounter = DAG.getConstant( + RISCVSysReg::lookupSysRegByName("TIME")->Encoding, DL, XLenVT); + HiCounter = DAG.getConstant( + RISCVSysReg::lookupSysRegByName("TIMEH")->Encoding, DL, XLenVT); + } SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); - SDValue RCW = - DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0)); + SDValue RCW = DAG.getNode(RISCVISD::READ_COUNTER_WIDE, DL, VTs, + N->getOperand(0), LoCounter, HiCounter); Results.push_back( DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, RCW, RCW.getValue(1))); @@ -16902,29 +16918,30 @@ RISCVTargetLowering::getTargetConstantFromLoad(LoadSDNode *Ld) const { return CNodeLo->getConstVal(); } -static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, - MachineBasicBlock *BB) { - assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction"); +static MachineBasicBlock *emitReadCounterWidePseudo(MachineInstr &MI, + MachineBasicBlock *BB) { + assert(MI.getOpcode() == RISCV::ReadCounterWide && "Unexpected instruction"); - // To read the 64-bit cycle CSR on a 32-bit target, we read 
the two halves. + // To read a 64-bit counter CSR on a 32-bit target, we read the two halves. // Should the count have wrapped while it was being read, we need to try // again. - // ... + // For example: + // ``` // read: - // rdcycleh x3 # load high word of cycle - // rdcycle x2 # load low word of cycle - // rdcycleh x4 # load high word of cycle - // bne x3, x4, read # check if high word reads match, otherwise try again - // ... + // csrrs x3, counterh # load high word of counter + // csrrs x2, counter # load low word of counter + // csrrs x4, counterh # load high word of counter + // bne x3, x4, read # check if high word reads match, otherwise try again + // ``` MachineFunction &MF = *BB->getParent(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); + const BasicBlock *LLVMBB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); - MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVMBB); MF.insert(It, LoopMBB); - MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVMBB); MF.insert(It, DoneMBB); // Transfer the remainder of BB and its successor edges to DoneMBB. 
@@ -16938,17 +16955,19 @@ static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, Register ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); Register LoReg = MI.getOperand(0).getReg(); Register HiReg = MI.getOperand(1).getReg(); + int64_t LoCounter = MI.getOperand(2).getImm(); + int64_t HiCounter = MI.getOperand(3).getImm(); DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg) - .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding) + .addImm(HiCounter) .addReg(RISCV::X0); BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg) - .addImm(RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding) + .addImm(LoCounter) .addReg(RISCV::X0); BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg) - .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding) + .addImm(HiCounter) .addReg(RISCV::X0); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) @@ -17527,10 +17546,10 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); - case RISCV::ReadCycleWide: + case RISCV::ReadCounterWide: assert(!Subtarget.is64Bit() && - "ReadCycleWrite is only to be used on riscv32"); - return emitReadCycleWidePseudo(MI, BB); + "ReadCounterWide is only to be used on riscv32"); + return emitReadCounterWidePseudo(MI, BB); case RISCV::Select_GPR_Using_CC_GPR: case RISCV::Select_FPR16_Using_CC_GPR: case RISCV::Select_FPR16INX_Using_CC_GPR: @@ -19202,7 +19221,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FCLASS) NODE_NAME_CASE(FMAX) NODE_NAME_CASE(FMIN) - NODE_NAME_CASE(READ_CYCLE_WIDE) + NODE_NAME_CASE(READ_COUNTER_WIDE) NODE_NAME_CASE(BREV8) NODE_NAME_CASE(ORC_B) NODE_NAME_CASE(ZIP) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 255b1d0e15eed..83b1c68eea61a 100644 --- 
a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -126,9 +126,10 @@ enum NodeType : unsigned { // Floating point fmax and fmin matching the RISC-V instruction semantics. FMAX, FMIN, - // READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target - // (returns (Lo, Hi)). It takes a chain operand. - READ_CYCLE_WIDE, + // A read of the 64-bit counter CSR on a 32-bit target (returns (Lo, Hi)). + // It takes a chain operand. + READ_COUNTER_WIDE, + // brev8, orc.b, zip, and unzip from Zbb and Zbkb. All operands are i32 or // XLenVT. BREV8, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 7fe9b626b66d6..0d2ffac4883a3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -33,8 +33,10 @@ def SDT_RISCVReadCSR : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>; def SDT_RISCVWriteCSR : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisInt<1>]>; def SDT_RISCVSwapCSR : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; -def SDT_RISCVReadCycleWide : SDTypeProfile<2, 0, [SDTCisVT<0, i32>, - SDTCisVT<1, i32>]>; +def SDT_RISCVReadCounterWide : SDTypeProfile<2, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, + SDTCisInt<2>, + SDTCisInt<3>]>; def SDT_RISCVIntUnaryOpW : SDTypeProfile<1, 1, [ SDTCisSameAs<0, 1>, SDTCisVT<0, i64> ]>; @@ -77,9 +79,9 @@ def riscv_write_csr : SDNode<"RISCVISD::WRITE_CSR", SDT_RISCVWriteCSR, def riscv_swap_csr : SDNode<"RISCVISD::SWAP_CSR", SDT_RISCVSwapCSR, [SDNPHasChain]>; -def riscv_read_cycle_wide : SDNode<"RISCVISD::READ_CYCLE_WIDE", - SDT_RISCVReadCycleWide, - [SDNPHasChain, SDNPSideEffect]>; +def riscv_read_counter_wide : SDNode<"RISCVISD::READ_COUNTER_WIDE", + SDT_RISCVReadCounterWide, + [SDNPHasChain, SDNPSideEffect]>; def riscv_add_lo : SDNode<"RISCVISD::ADD_LO", SDTIntBinOp>; def riscv_hi : SDNode<"RISCVISD::HI", SDTIntUnaryOp>; @@ -363,7 +365,7 @@ def CSRSystemRegister : AsmOperandClass { let 
DiagnosticType = "InvalidCSRSystemRegister"; } -def csr_sysreg : RISCVOp { +def csr_sysreg : RISCVOp, ImmLeaf(Imm);"> { let ParserMatchClass = CSRSystemRegister; let PrintMethod = "printCSRSystemRegister"; let DecoderMethod = "decodeUImmOperand<12>"; @@ -1827,16 +1829,21 @@ def : StPat; def : StPat; } // Predicates = [IsRV64] +// On RV64, we can directly read these 64-bit counter CSRs. +let Predicates = [IsRV64] in { /// readcyclecounter -// On RV64, we can directly read the 64-bit "cycle" CSR. -let Predicates = [IsRV64] in def : Pat<(i64 (readcyclecounter)), (CSRRS CYCLE.Encoding, (XLenVT X0))>; -// On RV32, ReadCycleWide will be expanded to the suggested loop reading both -// halves of the 64-bit "cycle" CSR. +/// readsteadycounter +def : Pat<(i64 (readsteadycounter)), (CSRRS TIME.Encoding, (XLenVT X0))>; +} + +// On RV32, ReadCounterWide will be expanded to the suggested loop reading both +// halves of 64-bit counter CSRs. let Predicates = [IsRV32], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in -def ReadCycleWide : Pseudo<(outs GPR:$lo, GPR:$hi), (ins), - [(set GPR:$lo, GPR:$hi, (riscv_read_cycle_wide))], - "", "">; +def ReadCounterWide : Pseudo<(outs GPR:$lo, GPR:$hi), (ins i32imm:$csr_lo, i32imm:$csr_hi), + [(set GPR:$lo, GPR:$hi, + (riscv_read_counter_wide csr_sysreg:$csr_lo, csr_sysreg:$csr_hi))], + "", "">; /// traps diff --git a/llvm/test/CodeGen/RISCV/readsteadycounter.ll b/llvm/test/CodeGen/RISCV/readsteadycounter.ll new file mode 100644 index 0000000000000..19eab64530c66 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/readsteadycounter.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32I %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64I %s + +; Verify that we lower @llvm.readsteadycounter() correctly. 
+ +declare i64 @llvm.readsteadycounter() + +define i64 @test_builtin_readsteadycounter() nounwind { +; RV32I-LABEL: test_builtin_readsteadycounter: +; RV32I: # %bb.0: +; RV32I-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: rdtimeh a1 +; RV32I-NEXT: rdtime a0 +; RV32I-NEXT: rdtimeh a2 +; RV32I-NEXT: bne a1, a2, .LBB0_1 +; RV32I-NEXT: # %bb.2: +; RV32I-NEXT: ret +; +; RV64I-LABEL: test_builtin_readsteadycounter: +; RV64I: # %bb.0: +; RV64I-NEXT: rdtime a0 +; RV64I-NEXT: ret + %1 = tail call i64 @llvm.readsteadycounter() + ret i64 %1 +} From a2afcd5721869d1d03c8146bae3885b3385ba15e Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Wed, 21 Feb 2024 11:07:02 +0530 Subject: [PATCH 050/351] Revert "Implement convergence control in MIR using SelectionDAG (#71785)" This reverts commit 79889734b940356ab3381423c93ae06f22e772c9. Encountered multiple buildbot failures. --- .../llvm/ADT/GenericConvergenceVerifier.h | 9 +- .../llvm/CodeGen/FunctionLoweringInfo.h | 10 ++- llvm/include/llvm/CodeGen/ISDOpcodes.h | 9 -- .../llvm/CodeGen/MachineConvergenceVerifier.h | 28 ------ llvm/include/llvm/CodeGen/SelectionDAGISel.h | 4 - llvm/include/llvm/CodeGen/TargetLowering.h | 6 -- .../llvm/IR/GenericConvergenceVerifierImpl.h | 25 +++--- llvm/include/llvm/Support/TargetOpcodes.def | 5 -- llvm/include/llvm/Target/Target.td | 19 ---- .../include/llvm/Target/TargetSelectionDAG.td | 10 --- llvm/lib/CodeGen/CMakeLists.txt | 1 - .../CodeGen/MachineConvergenceVerifier.cpp | 86 ------------------- llvm/lib/CodeGen/MachineVerifier.cpp | 34 -------- .../SelectionDAG/FunctionLoweringInfo.cpp | 10 --- .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 44 +--------- .../SelectionDAG/SelectionDAGBuilder.cpp | 50 +---------- .../SelectionDAG/SelectionDAGBuilder.h | 1 - .../SelectionDAG/SelectionDAGDumper.cpp | 10 --- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 24 ------ llvm/lib/CodeGen/ValueTypes.cpp | 2 - llvm/lib/IR/ConvergenceVerifier.cpp | 27 ++---- 
llvm/lib/IR/Verifier.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 27 +----- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 12 +-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 28 +----- llvm/lib/Target/AMDGPU/SIInstructions.td | 8 +- .../test/CodeGen/AMDGPU/convergence-tokens.ll | 83 ------------------ .../CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll | 18 ---- .../kernel-vgpr-spill-mubuf-with-voffset.ll | 1 - .../AMDGPU/need-fp-from-vgpr-spills.ll | 18 ++-- .../AMDGPU/no-source-locations-in-prologue.ll | 1 - .../AMDGPU/sgpr-spills-split-regalloc.ll | 15 ++-- .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 78 ++++++++--------- llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 26 +++--- .../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 1 - .../AMDGPU/whole-wave-register-spill.ll | 1 - .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 2 - llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 4 - llvm/test/CodeGen/PowerPC/fmf-propagation.ll | 12 +-- .../convergencectrl/AMDGPU/basic.mir | 37 -------- .../convergencectrl/AMDGPU/cycles.mir | 52 ----------- .../convergencectrl/AMDGPU/lit.local.cfg | 2 - .../convergencectrl/AMDGPU/mixed2.mir | 15 ---- .../convergencectrl/AMDGPU/not-ssa.mir | 11 --- .../convergencectrl/AMDGPU/region-nesting.mir | 24 ------ .../builtins/match-table-replacerreg.td | 2 +- .../match-table-imms.td | 30 +++---- .../match-table-intrinsics.td | 2 +- .../match-table-patfrag-root.td | 2 +- .../match-table-variadics.td | 4 +- .../GlobalISelCombinerEmitter/match-table.td | 62 ++++++------- 52 files changed, 162 insertions(+), 833 deletions(-) delete mode 100644 llvm/include/llvm/CodeGen/MachineConvergenceVerifier.h delete mode 100644 llvm/lib/CodeGen/MachineConvergenceVerifier.cpp delete mode 100644 llvm/test/CodeGen/AMDGPU/convergence-tokens.ll delete mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/basic.mir delete mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/cycles.mir delete mode 
100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/lit.local.cfg delete mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/mixed2.mir delete mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/not-ssa.mir delete mode 100644 llvm/test/MachineVerifier/convergencectrl/AMDGPU/region-nesting.mir diff --git a/llvm/include/llvm/ADT/GenericConvergenceVerifier.h b/llvm/include/llvm/ADT/GenericConvergenceVerifier.h index d2943cf682f4f..0810a07013229 100644 --- a/llvm/include/llvm/ADT/GenericConvergenceVerifier.h +++ b/llvm/include/llvm/ADT/GenericConvergenceVerifier.h @@ -32,12 +32,11 @@ template class GenericConvergenceVerifier { void initialize(raw_ostream *OS, function_ref FailureCB, - const FunctionT &F, bool _IsSSA) { + const FunctionT &F) { clear(); this->OS = OS; this->FailureCB = FailureCB; Context = ContextT(&F); - IsSSA = _IsSSA; } void clear(); @@ -53,7 +52,6 @@ template class GenericConvergenceVerifier { DominatorTreeT *DT; CycleInfoT CI; ContextT Context; - bool IsSSA; /// Whether the current function has convergencectrl operand bundles. enum { @@ -62,10 +60,6 @@ template class GenericConvergenceVerifier { NoConvergence } ConvergenceKind = NoConvergence; - /// The control token operation performed by a convergence control Intrinsic - /// in LLVM IR, or by a CONVERGENCECTRL* instruction in MIR - enum ConvOpKind { CONV_ANCHOR, CONV_ENTRY, CONV_LOOP, CONV_NONE }; - // Cache token uses found so far. Note that we track the unique definitions // and not the token values. 
DenseMap Tokens; @@ -74,7 +68,6 @@ template class GenericConvergenceVerifier { static bool isInsideConvergentFunction(const InstructionT &I); static bool isConvergent(const InstructionT &I); - static ConvOpKind getConvOp(const InstructionT &I); const InstructionT *findAndCheckConvergenceTokenUsed(const InstructionT &I); void reportFailure(const Twine &Message, ArrayRef Values); diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h index 31af3014afe4e..cde7247aeb151 100644 --- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -215,7 +215,15 @@ class FunctionLoweringInfo { Register CreateRegs(Type *Ty, bool isDivergent = false); - Register InitializeRegForValue(const Value *V); + Register InitializeRegForValue(const Value *V) { + // Tokens never live in vregs. + if (V->getType()->isTokenTy()) + return 0; + Register &R = ValueMap[V]; + assert(R == 0 && "Already initialized this value register!"); + assert(VirtReg2Value.empty()); + return R = CreateRegs(V); + } /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the /// register is a PHI destination and the PHI's LiveOutInfo is not valid. diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 079abb3a5be3a..8cb0bc9fd9813 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1384,15 +1384,6 @@ enum NodeType { #define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) VPSDID, #include "llvm/IR/VPIntrinsics.def" - // The `llvm.experimental.convergence.*` intrinsics. - CONVERGENCECTRL_ANCHOR, - CONVERGENCECTRL_ENTRY, - CONVERGENCECTRL_LOOP, - // This does not correspond to any convergence control intrinsic. It used to - // glue a convergence control token to a convergent operation in the DAG, - // which is later translated to an implicit use in the MIR. 
- CONVERGENCECTRL_GLUE, - /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/llvm/include/llvm/CodeGen/MachineConvergenceVerifier.h b/llvm/include/llvm/CodeGen/MachineConvergenceVerifier.h deleted file mode 100644 index b2faa30816c68..0000000000000 --- a/llvm/include/llvm/CodeGen/MachineConvergenceVerifier.h +++ /dev/null @@ -1,28 +0,0 @@ -//===- MachineConvergenceVerifier.h - Verify convergenctrl ------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// -/// This file declares the MIR specialization of the GenericConvergenceVerifier -/// template. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_MACHINECONVERGENCEVERIFIER_H -#define LLVM_CODEGEN_MACHINECONVERGENCEVERIFIER_H - -#include "llvm/ADT/GenericConvergenceVerifier.h" -#include "llvm/CodeGen/MachineSSAContext.h" - -namespace llvm { - -using MachineConvergenceVerifier = - GenericConvergenceVerifier; - -} // namespace llvm - -#endif // LLVM_CODEGEN_MACHINECONVERGENCEVERIFIER_H diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index 837f8bf7263ea..dbd9b391f4a43 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -459,10 +459,6 @@ class SelectionDAGISel : public MachineFunctionPass { void Select_ARITH_FENCE(SDNode *N); void Select_MEMBARRIER(SDNode *N); - void Select_CONVERGENCECTRL_ANCHOR(SDNode *N); - void Select_CONVERGENCECTRL_ENTRY(SDNode *N); - void Select_CONVERGENCECTRL_LOOP(SDNode *N); - void pushStackMapLiveVariable(SmallVectorImpl &Ops, SDValue Operand, 
SDLoc DL); void Select_STACKMAP(SDNode *N); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index cbdeaf8b38783..612433b54f6e4 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4401,7 +4401,6 @@ class TargetLowering : public TargetLoweringBase { SmallVector Ins; SmallVector InVals; const ConstantInt *CFIType = nullptr; - SDValue ConvergenceControlToken; CallLoweringInfo(SelectionDAG &DAG) : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false), @@ -4535,11 +4534,6 @@ class TargetLowering : public TargetLoweringBase { return *this; } - CallLoweringInfo &setConvergenceControlToken(SDValue Token) { - ConvergenceControlToken = Token; - return *this; - } - ArgListTy &getArgs() { return Args; } diff --git a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h index 9c20aa6499ee8..f6eb5066d5535 100644 --- a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h +++ b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h @@ -52,7 +52,6 @@ template void GenericConvergenceVerifier::clear() { Tokens.clear(); CI.clear(); ConvergenceKind = NoConvergence; - IsSSA = false; } template @@ -62,16 +61,12 @@ void GenericConvergenceVerifier::visit(const BlockT &BB) { template void GenericConvergenceVerifier::visit(const InstructionT &I) { - ConvOpKind ConvOp = getConvOp(I); - if (!IsSSA) { - Check(ConvOp == CONV_NONE, "Convergence control requires SSA.", - {Context.print(&I)}); - return; - } + auto ID = ContextT::getIntrinsicID(I); auto *TokenDef = findAndCheckConvergenceTokenUsed(I); + bool IsCtrlIntrinsic = true; - switch (ConvOp) { - case CONV_ENTRY: + switch (ID) { + case Intrinsic::experimental_convergence_entry: Check(isInsideConvergentFunction(I), "Entry intrinsic can occur only in a convergent function.", {Context.print(&I)}); @@ -83,13 +78,13 @@ void GenericConvergenceVerifier::visit(const InstructionT 
&I) { "same basic block.", {Context.print(&I)}); LLVM_FALLTHROUGH; - case CONV_ANCHOR: + case Intrinsic::experimental_convergence_anchor: Check(!TokenDef, "Entry or anchor intrinsic cannot have a convergencectrl token " "operand.", {Context.print(&I)}); break; - case CONV_LOOP: + case Intrinsic::experimental_convergence_loop: Check(TokenDef, "Loop intrinsic must have a convergencectrl token operand.", {Context.print(&I)}); Check(!SeenFirstConvOp, @@ -98,13 +93,14 @@ void GenericConvergenceVerifier::visit(const InstructionT &I) { {Context.print(&I)}); break; default: + IsCtrlIntrinsic = false; break; } if (isConvergent(I)) SeenFirstConvOp = true; - if (TokenDef || ConvOp != CONV_NONE) { + if (TokenDef || IsCtrlIntrinsic) { Check(isConvergent(I), "Convergence control token can only be used in a convergent call.", {Context.print(&I)}); @@ -165,7 +161,8 @@ void GenericConvergenceVerifier::verify(const DominatorTreeT &DT) { return; } - Check(getConvOp(*User) == CONV_LOOP, + Check(ContextT::getIntrinsicID(*User) == + Intrinsic::experimental_convergence_loop, "Convergence token used by an instruction other than " "llvm.experimental.convergence.loop in a cycle that does " "not contain the token's definition.", @@ -202,7 +199,7 @@ void GenericConvergenceVerifier::verify(const DominatorTreeT &DT) { for (auto &I : *BB) { if (auto *Token = Tokens.lookup(&I)) checkToken(Token, &I, LiveTokens); - if (getConvOp(I) != CONV_NONE) + if (isConvergenceControlIntrinsic(ContextT::getIntrinsicID(I))) LiveTokens.push_back(&I); } diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 6aded2ceebe13..42cb854d95050 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -225,11 +225,6 @@ HANDLE_TARGET_OPCODE(MEMBARRIER) // using. 
HANDLE_TARGET_OPCODE(JUMP_TABLE_DEBUG_INFO) -HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ENTRY) -HANDLE_TARGET_OPCODE(CONVERGENCECTRL_ANCHOR) -HANDLE_TARGET_OPCODE(CONVERGENCECTRL_LOOP) -HANDLE_TARGET_OPCODE(CONVERGENCECTRL_GLUE) - /// The following generic opcodes are not supposed to appear after ISel. /// This is something we might want to relax, but for now, this is convenient /// to produce diagnostics. diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 0577c58f8da2d..0d97a47190b19 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1483,25 +1483,6 @@ def JUMP_TABLE_DEBUG_INFO : StandardPseudoInstruction { let isMeta = true; } -let hasSideEffects = false, isMeta = true, isConvergent = true in { -def CONVERGENCECTRL_ANCHOR : StandardPseudoInstruction { - let OutOperandList = (outs unknown:$dst); - let InOperandList = (ins); -} -def CONVERGENCECTRL_ENTRY : StandardPseudoInstruction { - let OutOperandList = (outs unknown:$dst); - let InOperandList = (ins); -} -def CONVERGENCECTRL_LOOP : StandardPseudoInstruction { - let OutOperandList = (outs unknown:$dst); - let InOperandList = (ins unknown:$src); -} -def CONVERGENCECTRL_GLUE : StandardPseudoInstruction { - let OutOperandList = (outs); - let InOperandList = (ins unknown:$src); -} -} - // Generic opcodes used in GlobalISel. 
include "llvm/Target/GenericOpcodes.td" diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index b33c12a125ce5..5f8bf0d448105 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -782,16 +782,6 @@ def assertsext : SDNode<"ISD::AssertSext", SDT_assert>; def assertzext : SDNode<"ISD::AssertZext", SDT_assert>; def assertalign : SDNode<"ISD::AssertAlign", SDT_assert>; -def convergencectrl_anchor : SDNode<"ISD::CONVERGENCECTRL_ANCHOR", - SDTypeProfile<1, 0, [SDTCisVT<0,untyped>]>>; -def convergencectrl_entry : SDNode<"ISD::CONVERGENCECTRL_ENTRY", - SDTypeProfile<1, 0, [SDTCisVT<0,untyped>]>>; -def convergencectrl_loop : SDNode<"ISD::CONVERGENCECTRL_LOOP", - SDTypeProfile<1, 1, - [SDTCisVT<0,untyped>, SDTCisVT<1,untyped>]>>; -def convergencectrl_glue : SDNode<"ISD::CONVERGENCECTRL_GLUE", - SDTypeProfile<0, 1, [SDTCisVT<0, untyped>]>>; - //===----------------------------------------------------------------------===// // Selection DAG Condition Codes diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 82d665b0691d2..d49bcf8a0c8ee 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -109,7 +109,6 @@ add_llvm_component_library(LLVMCodeGen MachineBranchProbabilityInfo.cpp MachineCFGPrinter.cpp MachineCombiner.cpp - MachineConvergenceVerifier.cpp MachineCopyPropagation.cpp MachineCSE.cpp MachineCheckDebugify.cpp diff --git a/llvm/lib/CodeGen/MachineConvergenceVerifier.cpp b/llvm/lib/CodeGen/MachineConvergenceVerifier.cpp deleted file mode 100644 index 2f384fe6204d1..0000000000000 --- a/llvm/lib/CodeGen/MachineConvergenceVerifier.cpp +++ /dev/null @@ -1,86 +0,0 @@ -//===- ConvergenceVerifier.cpp - Verify convergence control -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/MachineConvergenceVerifier.h" -#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineSSAContext.h" -#include "llvm/IR/GenericConvergenceVerifierImpl.h" - -using namespace llvm; - -template <> -auto GenericConvergenceVerifier::getConvOp( - const MachineInstr &MI) -> ConvOpKind { - switch (MI.getOpcode()) { - default: - return CONV_NONE; - case TargetOpcode::CONVERGENCECTRL_ENTRY: - return CONV_ENTRY; - case TargetOpcode::CONVERGENCECTRL_ANCHOR: - return CONV_ANCHOR; - case TargetOpcode::CONVERGENCECTRL_LOOP: - return CONV_LOOP; - } -} - -template <> -const MachineInstr * -GenericConvergenceVerifier::findAndCheckConvergenceTokenUsed( - const MachineInstr &MI) { - const MachineRegisterInfo &MRI = Context.getFunction()->getRegInfo(); - const MachineInstr *TokenDef = nullptr; - - for (const MachineOperand &MO : MI.uses()) { - if (!MO.isReg()) - continue; - Register OpReg = MO.getReg(); - if (!OpReg.isVirtual()) - continue; - - const MachineInstr *Def = MRI.getVRegDef(OpReg); - if (!Def) - continue; - if (getConvOp(*Def) == CONV_NONE) - continue; - - CheckOrNull( - MI.isConvergent(), - "Convergence control tokens can only be used by convergent operations.", - {Context.print(OpReg), Context.print(&MI)}); - - CheckOrNull(!TokenDef, - "An operation can use at most one convergence control token.", - {Context.print(OpReg), Context.print(&MI)}); - - TokenDef = Def; - } - - if (TokenDef) - Tokens[&MI] = TokenDef; - - return TokenDef; -} - -template <> -bool GenericConvergenceVerifier::isInsideConvergentFunction( - const MachineInstr &MI) { - // The class MachineFunction does not have any property to 
indicate whether it - // is convergent. Trivially return true so that the check always passes. - return true; -} - -template <> -bool GenericConvergenceVerifier::isConvergent( - const MachineInstr &MI) { - return MI.isConvergent(); -} - -template class llvm::GenericConvergenceVerifier; diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index d1635cbd5bc85..2632b5b9feac9 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -39,8 +39,6 @@ #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineConvergenceVerifier.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -222,11 +220,6 @@ namespace { LiveStacks *LiveStks = nullptr; SlotIndexes *Indexes = nullptr; - // This is calculated only when trying to verify convergence control tokens. - // Similar to the LLVM IR verifier, we calculate this locally instead of - // relying on the pass manager. 
- MachineDomTree DT; - void visitMachineFunctionBefore(); void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); void visitMachineBundleBefore(const MachineInstr *MI); @@ -2962,34 +2955,7 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) { } } -static void -verifyConvergenceControl(const MachineFunction &MF, MachineDomTree &DT, - std::function FailureCB) { - using MFP = MachineFunctionProperties::Property; - const MachineFunctionProperties &Properties = MF.getProperties(); - bool IsSSA = Properties.hasProperty(MFP::IsSSA); - - MachineConvergenceVerifier CV; - CV.initialize(&errs(), FailureCB, MF, IsSSA); - - for (const auto &MBB : MF) { - CV.visit(MBB); - for (const auto &MI : MBB.instrs()) - CV.visit(MI); - } - - if (CV.sawTokens()) { - DT.recalculate(const_cast(MF)); - CV.verify(DT); - } -} - void MachineVerifier::visitMachineFunctionAfter() { - auto FailureCB = [this](const Twine &Message) { - report(Message.str().c_str(), MF); - }; - verifyConvergenceControl(*MF, DT, FailureCB); - calcRegsPassed(); for (const MachineBasicBlock &MBB : *MF) diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index e01cd8cbf925a..4172fbc96d1e5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -395,16 +395,6 @@ Register FunctionLoweringInfo::CreateRegs(const Value *V) { !TLI->requiresUniformRegister(*MF, V)); } -Register FunctionLoweringInfo::InitializeRegForValue(const Value *V) { - // Tokens live in vregs only when used for convergence control. 
- if (V->getType()->isTokenTy() && !isa(V)) - return 0; - Register &R = ValueMap[V]; - assert(R == Register() && "Already initialized this value register!"); - assert(VirtReg2Value.empty()); - return R = CreateRegs(V); -} - /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the /// register is a PHI destination and the PHI's LiveOutInfo is not valid. If /// the register's LiveOutInfo is for a smaller bit width, it is extended to diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 54409cbf91f1f..032cff416cda9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -285,30 +285,6 @@ Register InstrEmitter::getVR(SDValue Op, return I->second; } -static bool isConvergenceCtrlMachineOp(SDValue Op) { - if (Op->isMachineOpcode()) { - switch (Op->getMachineOpcode()) { - case TargetOpcode::CONVERGENCECTRL_ANCHOR: - case TargetOpcode::CONVERGENCECTRL_ENTRY: - case TargetOpcode::CONVERGENCECTRL_LOOP: - case TargetOpcode::CONVERGENCECTRL_GLUE: - return true; - } - return false; - } - - // We can reach here when CopyFromReg is encountered. But rather than making a - // special case for that, we just make sure we don't reach here in some - // surprising way. - switch (Op->getOpcode()) { - case ISD::CONVERGENCECTRL_ANCHOR: - case ISD::CONVERGENCECTRL_ENTRY: - case ISD::CONVERGENCECTRL_LOOP: - case ISD::CONVERGENCECTRL_GLUE: - llvm_unreachable("Convergence control should have been selected by now."); - } - return false; -} /// AddRegisterOperand - Add the specified register as an operand to the /// specified machine instr. Insert register copies if the register is @@ -370,12 +346,9 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, // multiple uses. // Tied operands are never killed, so we need to check that. And that // means we need to determine the index of the operand. - // Don't kill convergence control tokens. 
Initially they are only used in glue - // nodes, and the InstrEmitter later adds implicit uses on the users of the - // glue node. This can sometimes make it seem like there is only one use, - // which is the glue node itself. - bool isKill = Op.hasOneUse() && !isConvergenceCtrlMachineOp(Op) && - Op.getNode()->getOpcode() != ISD::CopyFromReg && !IsDebug && + bool isKill = Op.hasOneUse() && + Op.getNode()->getOpcode() != ISD::CopyFromReg && + !IsDebug && !(IsClone || IsCloned); if (isKill) { unsigned Idx = MIB->getNumOperands(); @@ -1218,17 +1191,6 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, } } - if (SDNode *GluedNode = Node->getGluedNode()) { - // FIXME: Possibly iterate over multiple glue nodes? - if (GluedNode->getOpcode() == - ~(unsigned)TargetOpcode::CONVERGENCECTRL_GLUE) { - Register VReg = getVR(GluedNode->getOperand(0), VRBaseMap); - MachineOperand MO = MachineOperand::CreateReg(VReg, /*isDef=*/false, - /*isImp=*/true); - MIB->addOperand(MO); - } - } - // Run post-isel target hook to adjust this instruction if needed. if (II.hasPostISelHook()) TLI->AdjustInstrPostInstrSelection(*MIB, Node); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 97d8b48b4bd36..2bdf48643edc3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5065,17 +5065,6 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Create the node. 
SDValue Result; - - if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) { - auto *Token = Bundle->Inputs[0].get(); - SDValue ConvControlToken = getValue(Token); - assert(Ops.back().getValueType() != MVT::Glue && - "Did not expected another glue node here."); - ConvControlToken = - DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken); - Ops.push_back(ConvControlToken); - } - // In some cases, custom collection of operands from CallInst I may be needed. TLI.CollectTargetIntrinsicOperands(I, Ops, DAG); if (IsTgtIntrinsic) { @@ -6076,27 +6065,6 @@ bool SelectionDAGBuilder::visitEntryValueDbgValue( return true; } -/// Lower the call to the specified intrinsic function. -void SelectionDAGBuilder::visitConvergenceControl(const CallInst &I, - unsigned Intrinsic) { - SDLoc sdl = getCurSDLoc(); - switch (Intrinsic) { - case Intrinsic::experimental_convergence_anchor: - setValue(&I, DAG.getNode(ISD::CONVERGENCECTRL_ANCHOR, sdl, MVT::Untyped)); - break; - case Intrinsic::experimental_convergence_entry: - setValue(&I, DAG.getNode(ISD::CONVERGENCECTRL_ENTRY, sdl, MVT::Untyped)); - break; - case Intrinsic::experimental_convergence_loop: { - auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl); - auto *Token = Bundle->Inputs[0].get(); - setValue(&I, DAG.getNode(ISD::CONVERGENCECTRL_LOOP, sdl, MVT::Untyped, - getValue(Token))); - break; - } - } -} - /// Lower the call to the specified intrinsic function. 
void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { @@ -7756,10 +7724,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::experimental_vector_deinterleave2: visitVectorDeinterleave(I); return; - case Intrinsic::experimental_convergence_anchor: - case Intrinsic::experimental_convergence_entry: - case Intrinsic::experimental_convergence_loop: - visitConvergenceControl(I, Intrinsic); } } @@ -8434,14 +8398,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, } } - SDValue ConvControlToken; - if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) { - auto *Token = Bundle->Inputs[0].get(); - ConvControlToken = getValue(Token); - } else { - ConvControlToken = DAG.getUNDEF(MVT::Untyped); - } - TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(getCurSDLoc()) .setChain(getRoot()) @@ -8450,8 +8406,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee, .setConvergent(CB.isConvergent()) .setIsPreallocated( CB.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0) - .setCFIType(CFIType) - .setConvergenceControlToken(ConvControlToken); + .setCFIType(CFIType); std::pair Result = lowerInvokable(CLI, EHPadBB); if (Result.first.getNode()) { @@ -9003,8 +8958,7 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { assert(!I.hasOperandBundlesOtherThan( {LLVMContext::OB_deopt, LLVMContext::OB_funclet, LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated, - LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_kcfi, - LLVMContext::OB_convergencectrl}) && + LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_kcfi}) && "Cannot lower calls with arbitrary operand bundles!"); SDValue Callee = getValue(I.getCalledOperand()); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 9b735672eedfb..47657313cb6a3 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -618,7 +618,6 @@ class SelectionDAGBuilder { void visitIntrinsicCall(const CallInst &I, unsigned Intrinsic); void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); - void visitConvergenceControl(const CallInst &I, unsigned Intrinsic); void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT, const SmallVectorImpl &OpValues); void visitVPStore(const VPIntrinsic &VPIntrin, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 5b8772f413a62..0fbd999694f10 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -165,9 +165,6 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { if (cast(this)->isOpaque()) return "OpaqueTargetConstant"; return "TargetConstant"; - - // clang-format off - case ISD::TargetConstantFP: return "TargetConstantFP"; case ISD::TargetGlobalAddress: return "TargetGlobalAddress"; case ISD::TargetGlobalTLSAddress: return "TargetGlobalTLSAddress"; @@ -450,11 +447,6 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SET_FPMODE: return "set_fpmode"; case ISD::RESET_FPMODE: return "reset_fpmode"; - // Convergence control instructions - case ISD::CONVERGENCECTRL_ANCHOR: return "convergencectrl_anchor"; - case ISD::CONVERGENCECTRL_ENTRY: return "convergencectrl_entry"; - case ISD::CONVERGENCECTRL_LOOP: return "convergencectrl_loop"; - // Bit manipulation case ISD::ABS: return "abs"; case ISD::BITREVERSE: return "bitreverse"; @@ -470,8 +462,6 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::INIT_TRAMPOLINE: return "init_trampoline"; case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; - // clang-format on - case ISD::CONDCODE: switch 
(cast(this)->get()) { default: llvm_unreachable("Unknown setcc condition!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 1c14e4da8e9d3..9b5ab4267b80e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2370,21 +2370,6 @@ void SelectionDAGISel::Select_MEMBARRIER(SDNode *N) { N->getOperand(0)); } -void SelectionDAGISel::Select_CONVERGENCECTRL_ANCHOR(SDNode *N) { - CurDAG->SelectNodeTo(N, TargetOpcode::CONVERGENCECTRL_ANCHOR, - N->getValueType(0)); -} - -void SelectionDAGISel::Select_CONVERGENCECTRL_ENTRY(SDNode *N) { - CurDAG->SelectNodeTo(N, TargetOpcode::CONVERGENCECTRL_ENTRY, - N->getValueType(0)); -} - -void SelectionDAGISel::Select_CONVERGENCECTRL_LOOP(SDNode *N) { - CurDAG->SelectNodeTo(N, TargetOpcode::CONVERGENCECTRL_LOOP, - N->getValueType(0), N->getOperand(0)); -} - void SelectionDAGISel::pushStackMapLiveVariable(SmallVectorImpl &Ops, SDValue OpVal, SDLoc DL) { SDNode *OpNode = OpVal.getNode(); @@ -3132,15 +3117,6 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::JUMP_TABLE_DEBUG_INFO: Select_JUMP_TABLE_DEBUG_INFO(NodeToMatch); return; - case ISD::CONVERGENCECTRL_ANCHOR: - Select_CONVERGENCECTRL_ANCHOR(NodeToMatch); - return; - case ISD::CONVERGENCECTRL_ENTRY: - Select_CONVERGENCECTRL_ENTRY(NodeToMatch); - return; - case ISD::CONVERGENCECTRL_LOOP: - Select_CONVERGENCECTRL_LOOP(NodeToMatch); - return; } assert(!NodeToMatch->isMachineOpcode() && "Node already selected!"); diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index fe4f1fb658ad5..731fcabaee402 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -627,8 +627,6 @@ EVT EVT::getEVT(Type *Ty, bool HandleUnknown){ switch (Ty->getTypeID()) { default: return MVT::getVT(Ty, HandleUnknown); - case Type::TokenTyID: - return MVT::Untyped; case Type::IntegerTyID: return 
getIntegerVT(Ty->getContext(), cast(Ty)->getBitWidth()); case Type::FixedVectorTyID: diff --git a/llvm/lib/IR/ConvergenceVerifier.cpp b/llvm/lib/IR/ConvergenceVerifier.cpp index 41361fb9c3066..336c202b6f94c 100644 --- a/llvm/lib/IR/ConvergenceVerifier.cpp +++ b/llvm/lib/IR/ConvergenceVerifier.cpp @@ -14,24 +14,6 @@ using namespace llvm; -template <> -auto GenericConvergenceVerifier::getConvOp(const Instruction &I) - -> ConvOpKind { - const auto *CB = dyn_cast(&I); - if (!CB) - return CONV_NONE; - switch (CB->getIntrinsicID()) { - default: - return CONV_NONE; - case Intrinsic::experimental_convergence_anchor: - return CONV_ANCHOR; - case Intrinsic::experimental_convergence_entry: - return CONV_ENTRY; - case Intrinsic::experimental_convergence_loop: - return CONV_LOOP; - } -} - template <> const Instruction * GenericConvergenceVerifier::findAndCheckConvergenceTokenUsed( @@ -56,10 +38,11 @@ GenericConvergenceVerifier::findAndCheckConvergenceTokenUsed( auto *Token = Bundle->Inputs[0].get(); auto *Def = dyn_cast(Token); - CheckOrNull(Def && getConvOp(*Def) != CONV_NONE, - "Convergence control tokens can only be produced by calls to the " - "convergence control intrinsics.", - {Context.print(Token), Context.print(&I)}); + CheckOrNull( + Def && isConvergenceControlIntrinsic(SSAContext::getIntrinsicID(*Def)), + "Convergence control tokens can only be produced by calls to the " + "convergence control intrinsics.", + {Context.print(Token), Context.print(&I)}); if (Def) Tokens[&I] = Def; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index f74a621360f88..b04d39c700a8f 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -412,7 +412,7 @@ class Verifier : public InstVisitor, VerifierSupport { auto FailureCB = [this](const Twine &Message) { this->CheckFailed(Message); }; - ConvergenceVerifyHelper.initialize(OS, FailureCB, F, /*isSSA=*/true); + ConvergenceVerifyHelper.initialize(OS, FailureCB, F); Broken = false; // FIXME: We strip const here 
because the inst visitor strips const. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index caba500053652..024adcda0fa06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2687,18 +2687,7 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { unsigned IntrID = N->getConstantOperandVal(0); - unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END; - SDNode *ConvGlueNode = N->getGluedNode(); - if (ConvGlueNode) { - // FIXME: Possibly iterate over multiple glue nodes? - assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE); - ConvGlueNode = ConvGlueNode->getOperand(0).getNode(); - ConvGlueNode = - CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {}, - MVT::Glue, SDValue(ConvGlueNode, 0)); - } else { - ConvGlueNode = nullptr; - } + unsigned Opcode; switch (IntrID) { case Intrinsic::amdgcn_wqm: Opcode = AMDGPU::WQM; @@ -2730,19 +2719,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { break; default: SelectCode(N); - break; - } - - if (Opcode != AMDGPU::INSTRUCTION_LIST_END) { - SDValue Src = N->getOperand(1); - CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); + return; } - if (ConvGlueNode) { - SmallVector NewOps(N->op_begin(), N->op_end()); - NewOps.push_back(SDValue(ConvGlueNode, 0)); - CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps); - } + SDValue Src = N->getOperand(1); + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); } void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d61d0a8014073..e26b4cf820a52 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -245,13 +245,6 @@ static cl::opt LateCFGStructurize( 
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -// Disable structurizer-based control-flow lowering in order to test convergence -// control tokens. This should eventually be replaced by the wave-transform. -static cl::opt DisableStructurizer( - "amdgpu-disable-structurizer", - cl::desc("Disable structurizer for experiments; produces unusable code"), - cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden); - // Enable lib calls simplifications static cl::opt EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -598,7 +591,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; -bool AMDGPUTargetMachine::DisableStructurizer = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -1193,7 +1185,7 @@ bool GCNPassConfig::addPreISel() { // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); - if (!LateCFGStructurize && !DisableStructurizer) { + if (!LateCFGStructurize) { if (EnableStructurizerWorkarounds) { addPass(createFixIrreduciblePass()); addPass(createUnifyLoopExitsPass()); @@ -1201,7 +1193,7 @@ bool GCNPassConfig::addPreISel() { addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions } addPass(createAMDGPUAnnotateUniformValues()); - if (!LateCFGStructurize && !DisableStructurizer) { + if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); // TODO: Move this right after structurizeCFG to avoid extra divergence // analysis. 
This depends on stopping SIAnnotateControlFlow from making diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 30ab388c7d52e..ce2dd2947daf6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -37,7 +37,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableLowerModuleLDS; - static bool DisableStructurizer; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4697751a4874f..5e1d750850374 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -98,7 +98,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f64, V64RegClass); addRegisterClass(MVT::v2f32, V64RegClass); - addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); @@ -3813,9 +3812,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); } - if (!IsTailCall) - Ops.push_back(CLI.ConvergenceControlToken); - if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual @@ -5143,28 +5139,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineInstrBuilder MIB; MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - MachineOperand &MO = MI.getOperand(I); - if (I != 2) { - MIB.add(MO); - continue; - } - } - - MachineOperand &MO = MI.getOperand(2); - MachineRegisterInfo &MRI = 
BB->getParent()->getRegInfo(); - // The token operand is always a register, whose definition is IMPLICIT_DEF - // iff there was no token on the call. - if (MachineInstr *Def = MRI.getVRegDef(MO.getReg())) { - if (Def->getOpcode() != TargetOpcode::IMPLICIT_DEF) { - LLVM_DEBUG({ - Def->dump(); - MO.dump(); - }); - MO.setImplicit(); - MIB.add(MO); - } - } + for (const MachineOperand &MO : MI.operands()) + MIB.add(MO); MIB.cloneMemRefs(MI); MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 33c93cdf20c43..565af36bc523e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -618,8 +618,8 @@ def SI_RETURN : SPseudoInstSI < // This version is only needed so we can fill in the output register // in the custom inserter. def SI_CALL_ISEL : SPseudoInstSI < - (outs), (ins SSrc_b64:$src0, unknown:$callee, unknown:$token), - [(AMDGPUcall i64:$src0, tglobaladdr:$callee, untyped:$token)]> { + (outs), (ins SSrc_b64:$src0, unknown:$callee), + [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> { let Size = 4; let isCall = 1; let SchedRW = [WriteBranch]; @@ -629,8 +629,8 @@ def SI_CALL_ISEL : SPseudoInstSI < } def : GCNPat< - (AMDGPUcall i64:$src0, (i64 0), untyped:$token), - (SI_CALL_ISEL $src0, (i64 0), untyped:$token) + (AMDGPUcall i64:$src0, (i64 0)), + (SI_CALL_ISEL $src0, (i64 0)) >; // Wrapper around s_swappc_b64 with extra $callee parameter to track diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll deleted file mode 100644 index 2ed6d7fd0f598..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: llc --amdgpu-disable-structurizer -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s -; RUN: llc --amdgpu-disable-structurizer -stop-after=dead-mi-elimination -mtriple=amdgcn-- 
-mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s - -; CHECK-LABEL: name: basic_call -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY -; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, [[TOKEN]], csr_amdgpu, {{.*}} -; DEADMI: {{.*}} SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] -define i32 @basic_call(i32 %src) #0 { - %t = call token @llvm.experimental.convergence.entry() - %r = call i32 @foo(i32 %src) [ "convergencectrl"(token %t) ] - ret i32 %r -} - -; CHECK-LABEL: name: basic_intrinsic -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR -; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]] -; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] -define i32 @basic_intrinsic(i32 %src) #0 { - %t = call token @llvm.experimental.convergence.anchor() - %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ] - ret i32 %r -} - -; There's nothing to check here. The test is just meant to catch any crashes -; when a convergent call has no token. 
-define i32 @uncontrolled_call(i32 %src) #0 { - %r = call i32 @foo(i32 %src) - ret i32 %r -} - -; CHECK-LABEL: name: basic_branch -; CHECK: bb.0.entry: -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR -; CHECK: bb.1.then: -; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]] -; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] -define i32 @basic_branch(i32 %src, i1 %cond) #0 { -entry: - %t = call token @llvm.experimental.convergence.anchor() - %x = add i32 %src, 1 - br i1 %cond, label %then, label %else - -then: - %r = call i32 @llvm.amdgcn.readfirstlane(i32 %x) [ "convergencectrl"(token %t) ] - br label %else - -else: - %p = phi i32 [%r, %then], [%x, %entry] - ret i32 %p -} - -; CHECK-LABEL: name: basic_loop -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR -; CHECK: bb.1.loop: -; CHECK: [[LOOP:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_LOOP [[TOKEN]] -; ISEL: CONVERGENCECTRL_GLUE [[LOOP]] -; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]] -define i32 @basic_loop(i32 %src, i1 %cond) #0 { - %t1 = call token @llvm.experimental.convergence.anchor() - br label %loop - -loop: - %t2 = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %t1) ] - %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t2) ] - br i1 %cond, label %loop, label %end - -end: - ret i32 %r -} - -declare i32 @foo(i32 %x) #0 - -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 - -declare token @llvm.experimental.convergence.entry() -declare token @llvm.experimental.convergence.anchor() -declare token @llvm.experimental.convergence.loop() - -attributes #0 = { nounwind readnone convergent } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll index e015095a4884a..ab160ffc10ed0 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll +++ 
b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll @@ -92,7 +92,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b ; DAGISEL-GFX11-NEXT: $vgpr5 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr6 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr7 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -122,7 +121,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b ; DAGISEL-GFX10-NEXT: $vgpr5 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr6 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr7 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -234,7 +232,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit 
$vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -272,7 +269,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -404,7 +400,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr12 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -454,7 +449,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr12 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: 
$vgpr13 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -506,7 +500,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b) ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -524,7 +517,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b) ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -576,7 +568,6 @@ define 
amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -594,7 +585,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -646,7 +636,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat % ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, 
implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -664,7 +653,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat % ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -716,7 +704,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) { ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -734,7 +721,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) { ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; 
DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -870,7 +856,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16 ; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr14 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr15 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -916,7 +901,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16 ; DAGISEL-GFX10-NEXT: $vgpr13 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr14 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr15 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -2480,7 +2464,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128 ; DAGISEL-GFX11-NEXT: $vgpr29 = COPY [[COPY134]] ; DAGISEL-GFX11-NEXT: $vgpr30 = COPY [[COPY133]] ; DAGISEL-GFX11-NEXT: $vgpr31 = COPY [[COPY132]] - ; DAGISEL-GFX11-NEXT: 
[[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -2827,7 +2810,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128 ; DAGISEL-GFX10-NEXT: $vgpr29 = COPY [[COPY134]] ; DAGISEL-GFX10-NEXT: $vgpr30 = COPY [[COPY133]] ; DAGISEL-GFX10-NEXT: $vgpr31 = COPY [[COPY132]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: 
S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 8b6b48bcdba0d..6e905542ce53c 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -60,7 +60,6 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index 5f507d482eeb6..f70441e87a74b 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,7 +27,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s24, s33 +; CHECK-NEXT: s_mov_b32 s18, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -43,7 +43,6 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 @@ -55,7 +54,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, 
s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s24 +; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -88,7 +87,6 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm bb: @@ -148,7 +146,6 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm bb: @@ -173,7 +170,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s24, s33 +; CHECK-NEXT: s_mov_b32 s18, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill @@ -188,7 +185,6 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 @@ -196,7 +192,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s24 +; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -208,7 +204,7 @@ define hidden i32 
@caller_save_vgpr_spill_fp() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s25, s33 +; CHECK-NEXT: s_mov_b32 s19, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -223,7 +219,6 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 @@ -231,7 +226,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s25 +; CHECK-NEXT: s_mov_b32 s33, s19 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -263,7 +258,6 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 34e67d0993fb7..9999cb9173b5d 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -32,7 +32,6 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 764f4942cbd03..f523b4a2495f1 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,7 +16,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill @@ -150,7 +150,6 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v255, 1 @@ -270,7 +269,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -311,7 +310,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -444,7 +443,6 @@ 
define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v254, 1 @@ -563,7 +561,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -1530,7 +1528,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -1668,7 +1666,6 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_mov_b64 exec, 1 @@ -1801,7 +1798,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll 
b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 33b5d6c6850bf..8c5b89429bcc1 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -916,13 +916,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_mov_b32 s32, 0x1200 -; WAVE32-O0-NEXT: s_getpc_b64 s[24:25] -; WAVE32-O0-NEXT: s_mov_b32 s24, s0 -; WAVE32-O0-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 +; WAVE32-O0-NEXT: s_getpc_b64 s[20:21] +; WAVE32-O0-NEXT: s_mov_b32 s20, s0 +; WAVE32-O0-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-O0-NEXT: s_bitset0_b32 s27, 21 -; WAVE32-O0-NEXT: s_add_u32 s24, s24, s9 -; WAVE32-O0-NEXT: s_addc_u32 s25, s25, 0 +; WAVE32-O0-NEXT: s_bitset0_b32 s23, 21 +; WAVE32-O0-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-O0-NEXT: s_addc_u32 s21, s21, 0 ; WAVE32-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: s_mov_b32 s14, s8 ; WAVE32-O0-NEXT: s_mov_b32 s13, s7 @@ -934,17 +934,17 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[24:25] -; WAVE32-O0-NEXT: 
s_mov_b64 s[2:3], s[26:27] +; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21] +; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23] ; WAVE32-O0-NEXT: s_mov_b32 s6, s32 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 ; WAVE32-O0-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi ; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1018,11 +1018,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s0, v0, 0 @@ -1137,7 +1136,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload @@ -1155,13 +1153,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; 
WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, 0x1200 -; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[24:25] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s0 -; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 +; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[20:21] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s20, s0 +; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s27, 21 -; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s24, s24, s9 -; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s25, s25, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s23, 21 +; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s21, s21, 0 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s8 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s13, s7 @@ -1174,13 +1172,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], 0 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[24:25] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[26:27] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, s32 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, 
stack_passed_argument@abs32@lo ; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1254,7 +1252,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s1, v32, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s0, v32, 0 @@ -1347,7 +1344,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-O0-NEXT: s_mov_b32 s26, s33 +; WAVE32-O0-NEXT: s_mov_b32 s25, s33 ; WAVE32-O0-NEXT: s_mov_b32 s33, s32 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1361,9 +1358,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s16, s16, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25 +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 ; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1440,11 +1437,10 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; 
WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25 +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s5, v0, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 @@ -1460,14 +1456,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-O0-NEXT: s_mov_b32 s33, s26 +; WAVE32-O0-NEXT: s_mov_b32 s33, s25 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-O0: ; %bb.0: ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-O0-NEXT: s_mov_b32 s28, s33 +; WAVE64-O0-NEXT: s_mov_b32 s19, s33 ; WAVE64-O0-NEXT: s_mov_b32 s33, s32 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; WAVE64-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1560,7 +1556,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload @@ -1580,14 +1575,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] ; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00 
-; WAVE64-O0-NEXT: s_mov_b32 s33, s28 +; WAVE64-O0-NEXT: s_mov_b32 s33, s19 ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] ; ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s25, s33 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1677,7 +1672,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v32, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v32, 0 @@ -1693,7 +1687,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s25 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index d2364a61ed686..bfc249e9081d2 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -233,10 +233,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr 
%extern_fun ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -249,8 +249,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -286,8 +286,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.7: ; 
SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -356,9 +356,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -371,7 +371,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef 
%54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -407,7 +407,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 364ce82b2e997..7840559c78eb6 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -47,7 +47,6 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_mov_b32 s15, 42 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 3a33194f17c87..7eabe982ff2bc 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ 
b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -101,7 +101,6 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 11f6a2960776b..e79cb66dcd776 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -406,7 +406,6 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 @@ -633,7 +632,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 6ac61410a0e7d..47c976d2a5c33 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -413,7 +413,6 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload @@ -657,7 +656,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1285,7 +1283,6 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1529,7 +1526,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll index 4e72a5ac5ede3..58b3ee485ea4b 100644 --- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll +++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll @@ -577,15 +577,15 @@ define double @fcmp_nnan(double %a, double %y, double %z) { ; FP library calls can have fast-math-flags. 
; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:' -; FMFDEBUG: ch,glue = PPCISD::CALL_NOP {{t[0-9]+}}, TargetGlobalAddress:i64 -; FMFDEBUG: ch,glue = callseq_end [[T15:t[0-9]+]], TargetConstant:i64<32>, TargetConstant:i64<0>, [[T15]]:1 -; FMFDEBUG: f64,ch,glue = CopyFromReg [[T16:t[0-9]+]], Register:f64 $f1, [[T16]]:1 +; FMFDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64 +; FMFDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1 +; FMFDEBUG: f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1 ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:' ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:' -; GLOBALDEBUG: ch,glue = PPCISD::CALL_NOP {{t[0-9]+}}, TargetGlobalAddress:i64 -; GLOBALDEBUG: ch,glue = callseq_end [[T15:t[0-9]+]], TargetConstant:i64<32>, TargetConstant:i64<0>, [[T15]]:1 -; GLOBALDEBUG: f64,ch,glue = CopyFromReg [[T16:t[0-9]+]], Register:f64 $f1, [[T16]]:1 +; GLOBALDEBUG: ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64 +; GLOBALDEBUG: ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1 +; GLOBALDEBUG: f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1 ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'log2_approx:' declare double @log2(double) diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/basic.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/basic.mir deleted file mode 100644 index 94d0ddad25944..0000000000000 --- a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/basic.mir +++ /dev/null @@ -1,37 +0,0 @@ -# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s ---- -name: basic -tracksRegLiveness: true -body: | - bb.0: - successors: %bb.1, %bb.2; - %0:sgpr_64 = CONVERGENCECTRL_ANCHOR - ; CHECK: Entry intrinsic cannot be preceded by a convergent operation in the same basic block. 
- ; CHECK: CONVERGENCECTRL_ENTRY - %1:sgpr_64 = CONVERGENCECTRL_ENTRY - ; CHECK: Loop intrinsic cannot be preceded by a convergent operation in the same basic block. - ; CHECK: CONVERGENCECTRL_LOOP - %2:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 - S_CBRANCH_EXECZ %bb.1, implicit $exec - S_BRANCH %bb.2 - - bb.1: - successors: %bb.2; - ; CHECK: Entry intrinsic can occur only in the entry block. - ; CHECK: CONVERGENCECTRL_ENTRY - %5:sgpr_64 = CONVERGENCECTRL_ENTRY - - bb.2: - ; CHECK: Convergence control tokens can only be used by convergent operations. - ; CHECK: G_PHI - %6:sgpr_64 = G_PHI %0:sgpr_64, %bb.0, %0:sgpr_64, %bb.1 - %7:sgpr_64 = CONVERGENCECTRL_ANCHOR - %8:sgpr_64 = IMPLICIT_DEF - %4:sgpr_64 = SI_CALL %8:sgpr_64, 1, implicit %7:sgpr_64 - ; CHECK: An operation can use at most one convergence control token. - ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 2 - %9:sgpr_64 = SI_CALL %8:sgpr_64, 2, implicit %7:sgpr_64, implicit %7:sgpr_64 - ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function. - ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 3 - %10:sgpr_64 = SI_CALL %8:sgpr_64, 3 -... diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/cycles.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/cycles.mir deleted file mode 100644 index 87cf3e604929b..0000000000000 --- a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/cycles.mir +++ /dev/null @@ -1,52 +0,0 @@ -# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s ---- -name: cycles -body: | - bb.0: - %0:sgpr_64 = CONVERGENCECTRL_ANCHOR - %1:sgpr_64 = IMPLICIT_DEF - S_CBRANCH_EXECZ %bb.9, implicit $exec - S_BRANCH %bb.1 - - bb.1: - S_CBRANCH_EXECZ %bb.8, implicit $exec - S_BRANCH %bb.5 - - bb.2: - S_CBRANCH_EXECZ %bb.3, implicit $exec - S_BRANCH %bb.4 - - bb.3: - ; CHECK: Cycle heart must dominate all blocks in the cycle. 
- ; Irreducible cycle: entries(bb.4 bb.3) - %3:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 - S_BRANCH %bb.4 - - bb.4: - S_BRANCH %bb.3 - - bb.5: - S_CBRANCH_EXECZ %bb.6, implicit $exec - S_BRANCH %bb.2 - - bb.6: - S_BRANCH %bb.7 - - bb.7: - ; CHECK: Cycle heart must dominate all blocks in the cycle. - ; Reducible cycle: entries(bb.6) bb.7 - %4:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 - S_BRANCH %bb.6 - - bb.8: - ; CHECK: Two static convergence token uses in a cycle that does not contain either token's definition. - %5:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 - %6:sgpr_64 = CONVERGENCECTRL_LOOP %0:sgpr_64 - S_BRANCH %bb.8 - - bb.9: - ; CHECK: Convergence token used by an instruction other than llvm.experimental.convergence.loop in a cycle that does not contain the token's definition. - %7:sgpr_64 = G_SI_CALL %1:sgpr_64, 3, implicit %0:sgpr_64 - S_BRANCH %bb.9 - -... diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/lit.local.cfg b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/lit.local.cfg deleted file mode 100644 index 7c492428aec76..0000000000000 --- a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not "AMDGPU" in config.root.targets: - config.unsupported = True diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/mixed2.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/mixed2.mir deleted file mode 100644 index c70a48bf21309..0000000000000 --- a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/mixed2.mir +++ /dev/null @@ -1,15 +0,0 @@ -# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s ---- -name: mixed2 -body: | - bb.0: - %0:sgpr_64 = IMPLICIT_DEF - %1:sgpr_64 = SI_CALL %0, 1 - ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function. - ; CHECK: CONVERGENCECTRL_ANCHOR - %2:sgpr_64 = CONVERGENCECTRL_ANCHOR - ; CHECK: Cannot mix controlled and uncontrolled convergence in the same function. 
- ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 2 - %3:sgpr_64 = SI_CALL %0, 2, implicit %2:sgpr_64 - -... diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/not-ssa.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/not-ssa.mir deleted file mode 100644 index b3834f4f4c571..0000000000000 --- a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/not-ssa.mir +++ /dev/null @@ -1,11 +0,0 @@ -# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s ---- -name: not_ssa -tracksRegLiveness: true -body: | - bb.0: - ; CHECK: Convergence control requires SSA. - %0:sgpr_64 = CONVERGENCECTRL_ANCHOR - %8:sgpr_64 = IMPLICIT_DEF - %8:sgpr_64 = IMPLICIT_DEF -... diff --git a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/region-nesting.mir b/llvm/test/MachineVerifier/convergencectrl/AMDGPU/region-nesting.mir deleted file mode 100644 index 9e869acb3e938..0000000000000 --- a/llvm/test/MachineVerifier/convergencectrl/AMDGPU/region-nesting.mir +++ /dev/null @@ -1,24 +0,0 @@ -# RUN: not --crash llc -march=amdgcn -run-pass=none -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s ---- -name: region_nesting -body: | - bb.0: - %0:sgpr_64 = CONVERGENCECTRL_ANCHOR - %1:sgpr_64 = CONVERGENCECTRL_ANCHOR - %2:sgpr_64 = IMPLICIT_DEF - %3:sgpr_64 = SI_CALL %2, 1, implicit %0:sgpr_64 - ; CHECK: Convergence region is not well-nested. - ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 2 - %4:sgpr_64 = SI_CALL %2, 2, implicit %1:sgpr_64 - S_CBRANCH_EXECZ %bb.1, implicit $exec - S_BRANCH %bb.2 - - bb.1: - %5:sgpr_64 = SI_CALL %2, 3, implicit %0:sgpr_64 - - bb.2: - ; CHECK: Convergence region is not well-nested. - ; CHECK: SI_CALL %{{[0-9]}}:sgpr_64, 4 - %6:sgpr_64 = SI_CALL %2, 4, implicit %1:sgpr_64 - -... 
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td index 40a831d7e9e8f..622d1df7b381a 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td @@ -28,7 +28,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(69), GIMT_Encode2(186), /*)*//*default:*//*Label 2*/ GIMT_Encode4(562), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(65), GIMT_Encode2(182), /*)*//*default:*//*Label 2*/ GIMT_Encode4(562), // CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(478), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ GIMT_Encode4(530), // CHECK-NEXT: // Label 0: @478 diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td index 751b1318ecc01..f0ca65a87b76b 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td @@ -34,12 +34,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(132), /*)*//*default:*//*Label 3*/ GIMT_Encode4(579), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(462), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4(493), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(539), -// CHECK-NEXT: // Label 0: @462 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(492), // Rule ID 0 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(128), /*)*//*default:*//*Label 3*/ GIMT_Encode4(563), +// CHECK-NEXT: 
/*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(446), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4(477), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ 
GIMT_Encode4(523), +// CHECK-NEXT: // Label 0: @446 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(476), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a @@ -51,10 +51,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddImm8, /*InsnID*/0, /*Imm*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @492 +// CHECK-NEXT: // Label 4: @476 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @493 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(538), // Rule ID 2 // +// CHECK-NEXT: // Label 1: @477 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(522), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a @@ -66,10 +66,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddCImm, /*InsnID*/0, /*Type*/GILLT_s32, /*Imm*/GIMT_Encode8(42), // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @538 +// CHECK-NEXT: // Label 5: @522 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @539 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(578), // Rule ID 1 // +// CHECK-NEXT: // Label 2: @523 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(562), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -83,10 +83,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 6: @578 +// 
CHECK-NEXT: // Label 6: @562 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @579 +// CHECK-NEXT: // Label 3: @563 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 580 bytes +// CHECK-NEXT: }; // Size: 564 bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td index e8e6d3e74f402..a446fb72298c2 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td @@ -29,7 +29,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(119), GIMT_Encode2(121), /*)*//*default:*//*Label 2*/ GIMT_Encode4(132), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(115), GIMT_Encode2(117), /*)*//*default:*//*Label 2*/ GIMT_Encode4(132), // CHECK-NEXT: /*TargetOpcode::G_INTRINSIC*//*Label 0*/ GIMT_Encode4(18), // CHECK-NEXT: /*TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS*//*Label 1*/ GIMT_Encode4(73), // CHECK-NEXT: // Label 0: @18 diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td index 26a0ec6235e30..d3c202c4cb01d 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td @@ -28,7 +28,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(124), GIMT_Encode2(187), /*)*//*default:*//*Label 3*/ GIMT_Encode4(380), +// CHECK-NEXT: GIM_SwitchOpcode, 
/*MI*/0, /*[*/GIMT_Encode2(120), GIMT_Encode2(183), /*)*//*default:*//*Label 3*/ GIMT_Encode4(380), // CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 0*/ GIMT_Encode4(262), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 1*/ GIMT_Encode4(298), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_FPEXT*//*Label 2*/ GIMT_Encode4(344), diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td index 83b77519bc73a..cc77bfdd29c38 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-variadics.td @@ -37,7 +37,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(69), GIMT_Encode2(73), /*)*//*default:*//*Label 2*/ GIMT_Encode4(88), 
+// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(65), GIMT_Encode2(69), /*)*//*default:*//*Label 2*/ GIMT_Encode4(88), // CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(26), GIMT_Encode4(0), GIMT_Encode4(0), // CHECK-NEXT: /*TargetOpcode::G_BUILD_VECTOR*//*Label 1*/ GIMT_Encode4(57), // CHECK-NEXT: // Label 0: @26 @@ -98,6 +98,6 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: // Label 2: @88 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 89 bytes +// CHECK-NEXT: }; // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index 5cf4e044a0fb8..57ad0009b5bd6 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -132,15 +132,15 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // Verify match table. 
// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(132), /*)*//*default:*//*Label 6*/ GIMT_Encode4(677), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(462), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(504), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(557), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(599), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(624), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(637), -// CHECK-NEXT: // Label 0: @462 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(491), // Rule ID 4 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(128), /*)*//*default:*//*Label 6*/ GIMT_Encode4(661), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(446), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(541), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(583), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(608), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(621), +// CHECK-NEXT: // Label 0: @446 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(475), // Rule ID 4 // // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HasAnswerToEverything), // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), // CHECK-NEXT: // MIs[0] a @@ -155,8 +155,8 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // Combiner Rule #3: InstTest1 // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 7: @491 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(503), // Rule ID 3 // +// CHECK-NEXT: // Label 7: @475 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(487), // Rule ID 3 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -165,10 +165,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // Combiner Rule #2: InstTest0 
// CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner1), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 8: @503 +// CHECK-NEXT: // Label 8: @487 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @504 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(556), // Rule ID 6 // +// CHECK-NEXT: // Label 1: @488 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(540), // Rule ID 6 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule5Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] dst @@ -186,10 +186,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // z // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 9: @556 +// CHECK-NEXT: // Label 9: @540 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @557 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(598), // Rule ID 5 // +// CHECK-NEXT: // Label 2: @541 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(582), // Rule ID 5 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled), // CHECK-NEXT: // MIs[0] tmp // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1] @@ -207,32 +207,32 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner2), // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 10: @598 +// CHECK-NEXT: // Label 10: @582 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @599 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(611), // Rule ID 0 // +// CHECK-NEXT: // Label 3: @583 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(595), // Rule ID 0 // // CHECK-NEXT: 
GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 11: @611 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(623), // Rule ID 1 // +// CHECK-NEXT: // Label 11: @595 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(607), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 12: @623 +// CHECK-NEXT: // Label 12: @607 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 4: @624 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(636), // Rule ID 2 // +// CHECK-NEXT: // Label 4: @608 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(620), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_SEXT' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 13: @636 +// CHECK-NEXT: // Label 13: @620 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 5: @637 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(676), // Rule ID 7 // +// CHECK-NEXT: // Label 5: @621 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(660), // Rule ID 7 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule6Enabled), // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -247,10 +247,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: 
GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 14: @676 +// CHECK-NEXT: // Label 14: @660 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 6: @677 +// CHECK-NEXT: // Label 6: @661 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 678 bytes +// CHECK-NEXT: }; // Size: 662 bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } From 5375cbfb6255ed19a6bed7065a697905ca65d575 Mon Sep 17 00:00:00 2001 From: Matteo Franciolini Date: Tue, 20 Feb 2024 21:40:36 -0800 Subject: [PATCH 051/351] Fix pipeline-invalid.mlir bytecode roundtrip test (#82366) If an op was not contained in a region when was written to bytecode, we don't have an initialized valueScope with forward references to define. --- mlir/lib/Bytecode/Reader/BytecodeReader.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp index 7cf3bd83b925c..d61634062784c 100644 --- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp +++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp @@ -2334,8 +2334,11 @@ BytecodeReader::Impl::parseOpWithoutRegions(EncodingReader &reader, Operation *op = Operation::create(opState); readState.curBlock->push_back(op); - // If the operation had results, update the value references. - if (op->getNumResults() && failed(defineValues(reader, op->getResults()))) + // If the operation had results, update the value references. We don't need to + // do this if the current value scope is empty. That is, the op was not + // encoded within a parent region. 
+ if (readState.numValues && op->getNumResults() && + failed(defineValues(reader, op->getResults()))) return failure(); /// Store a map for every value that received a custom use-list order from the From 44b717df4d837ce4e8d76b00cee2e122ae6ad28c Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Wed, 21 Feb 2024 00:42:22 -0500 Subject: [PATCH 052/351] [GlobalISel] Clamp out-of-range G_EXTRACT_VECTOR_ELT constant indices when converting them into loads. (#82460) This avoid turning a poison value into a segfault, and fixes https://github.com/llvm/llvm-project/issues/78383 --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 18 +++++---- .../AArch64/extractvector-oob-load.mir | 38 +++++++++++++++++++ 2 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/extractvector-oob-load.mir diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index e5b229fcd54f5..044cd3d2d426e 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3971,14 +3971,18 @@ LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment, return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx); } -static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg, - LLT VecTy) { - int64_t IdxVal; - if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal))) - return IdxReg; - +static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg, + LLT VecTy) { LLT IdxTy = B.getMRI()->getType(IdxReg); unsigned NElts = VecTy.getNumElements(); + + int64_t IdxVal; + if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal))) { + if (IdxVal < VecTy.getNumElements()) + return IdxReg; + // If a constant index would be out of bounds, clamp it as well. 
+ } + if (isPowerOf2_32(NElts)) { APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts)); return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0); @@ -3997,7 +4001,7 @@ Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy, assert(EltSize * 8 == EltTy.getSizeInBits() && "Converting bits to bytes lost precision"); - Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy); + Index = clampVectorIndex(MIRBuilder, Index, VecTy); LLT IdxTy = MRI.getType(Index); auto Mul = MIRBuilder.buildMul(IdxTy, Index, diff --git a/llvm/test/CodeGen/AArch64/extractvector-oob-load.mir b/llvm/test/CodeGen/AArch64/extractvector-oob-load.mir new file mode 100644 index 0000000000000..e8c5819e75e09 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/extractvector-oob-load.mir @@ -0,0 +1,38 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: f +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +liveins: + - { reg: '$x0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: f + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64)) + ; CHECK-NEXT: $x0 = COPY [[LOAD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(p0) = COPY $x0 + %3:_(s64) = G_CONSTANT i64 224567957 + %1:_(<3 x s64>) = G_LOAD %0(p0) :: (load (<3 x s64>), align 32) + %2:_(s64) = G_EXTRACT_VECTOR_ELT %1(<3 x s64>), %3(s64) + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 + +... 
From ec516ff3e6122069b36f32a6db8bb3dc672133fc Mon Sep 17 00:00:00 2001 From: Jooyung Han Date: Wed, 21 Feb 2024 14:46:25 +0900 Subject: [PATCH 053/351] Fix __isOSVersionAtLeast for Android (#80496) Allow pre-release APIs on pre-release devices. The current implementation requires __ANDROID_API_FUTURE__ to use new APIs on pre-release system. This makes it hard to maintain the codebase because it should be switched a concrete version (e.g. __ANDROID_API_X__ on release of X). Instead, we can just allow pre-release APIs on pre-release system without mandating the major version of __ANDROID_API_FUTURE__. Note that this doesn't make API guards just no-op in pre-release builds. We can still rely on its compile-time checks and it still works as expected with release builds. Even with pre-release builds, it's the same as before because we would pass __ANDROID_API_FUTURE__ to make the calls anyway. --- compiler-rt/lib/builtins/os_version_check.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/builtins/os_version_check.c b/compiler-rt/lib/builtins/os_version_check.c index 182eabe7a6ae2..01fae834ab219 100644 --- a/compiler-rt/lib/builtins/os_version_check.c +++ b/compiler-rt/lib/builtins/os_version_check.c @@ -316,8 +316,8 @@ int32_t __isOSVersionAtLeast(int32_t Major, int32_t Minor, int32_t Subminor) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, readSystemProperties); - return SdkVersion >= Major || - (IsPreRelease && Major == __ANDROID_API_FUTURE__); + // Allow all on pre-release. Note that we still rely on compile-time checks. + return SdkVersion >= Major || IsPreRelease; } #else From 04fbc461e0fd1c6f2b014761e9c03ca80d17b33b Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 20 Feb 2024 21:51:51 -0800 Subject: [PATCH 054/351] [clang-format] Fix RemoveSemicolon for empty functions (#82278) Fixes #79833. 
--- clang/lib/Format/Format.cpp | 23 ++++++++++++++++------- clang/unittests/Format/FormatTest.cpp | 19 +++++++++++++------ 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 2c815128b1a59..10ab406a15c6e 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -2261,27 +2261,36 @@ class SemiRemover : public TokenAnalyzer { FormatTokenLexer &Tokens) override { AffectedRangeMgr.computeAffectedLines(AnnotatedLines); tooling::Replacements Result; - removeSemi(AnnotatedLines, Result); + removeSemi(Annotator, AnnotatedLines, Result); return {Result, 0}; } private: - void removeSemi(SmallVectorImpl &Lines, + void removeSemi(TokenAnnotator &Annotator, + SmallVectorImpl &Lines, tooling::Replacements &Result) { + auto PrecededByFunctionRBrace = [](const FormatToken &Tok) { + const auto *Prev = Tok.Previous; + if (!Prev || Prev->isNot(tok::r_brace)) + return false; + const auto *LBrace = Prev->MatchingParen; + return LBrace && LBrace->is(TT_FunctionLBrace); + }; const auto &SourceMgr = Env.getSourceManager(); const auto End = Lines.end(); for (auto I = Lines.begin(); I != End; ++I) { const auto Line = *I; - removeSemi(Line->Children, Result); + removeSemi(Annotator, Line->Children, Result); if (!Line->Affected) continue; + Annotator.calculateFormattingInformation(*Line); const auto NextLine = I + 1 == End ? 
nullptr : I[1]; for (auto Token = Line->First; Token && !Token->Finalized; Token = Token->Next) { - if (!Token->Optional) - continue; - if (Token->isNot(tok::semi)) + if (Token->isNot(tok::semi) || + (!Token->Optional && !PrecededByFunctionRBrace(*Token))) { continue; + } auto Next = Token->Next; assert(Next || Token == Line->Last); if (!Next && NextLine) @@ -3677,7 +3686,7 @@ reformat(const FormatStyle &Style, StringRef Code, FormatStyle S = Expanded; S.RemoveSemicolon = true; Passes.emplace_back([&, S = std::move(S)](const Environment &Env) { - return SemiRemover(Env, S).process(/*SkipAnnotation=*/true); + return SemiRemover(Env, S).process(); }); } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 24f62af8ddcb8..8282e75bd847f 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -26720,13 +26720,20 @@ TEST_F(FormatTest, RemoveSemicolon) { verifyIncompleteFormat("class C final [[deprecated(l]] {});", Style); - // These tests are here to show a problem that may not be easily - // solved, our implementation to remove semicolons is only as good - // as our FunctionLBrace detection and this fails for empty braces - // because we can't distringuish this from a bracelist. - // We will enable when that is resolved. -#if 0 verifyFormat("void main() {}", "void main() {};", Style); + + verifyFormat("struct Foo {\n" + " Foo() {}\n" + " ~Foo() {}\n" + "};", + "struct Foo {\n" + " Foo() {};\n" + " ~Foo() {};\n" + "};", + Style); + +// We can't (and probably shouldn't) support the following. 
+#if 0 verifyFormat("void foo() {} //\n" "int bar;", "void foo() {}; //\n" From ab7dcb0ef634ef370618aa244ad28d8c654b894c Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Feb 2024 05:57:16 +0000 Subject: [PATCH 055/351] [gn build] Port a2afcd572186 --- llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index 59df787dbb712..e78ef13869e64 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -124,7 +124,6 @@ static_library("CodeGen") { "MachineCSE.cpp", "MachineCheckDebugify.cpp", "MachineCombiner.cpp", - "MachineConvergenceVerifier.cpp", "MachineCopyPropagation.cpp", "MachineCycleAnalysis.cpp", "MachineDebugify.cpp", From 8b23d68a621f16b6d66e68cb64b99f1221b9df2c Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Tue, 20 Feb 2024 23:02:03 -0800 Subject: [PATCH 056/351] [Analyzer] Support RefAllowingPartiallyDestroyed and RefPtrAllowingPartiallyDestroyed (#82209) This PR adds the support for WebKit's RefAllowingPartiallyDestroyed and RefPtrAllowingPartiallyDestroyed, which are smart pointer types which may be used after the destructor had started running. 
--- .../Checkers/WebKit/PtrTypesSemantics.cpp | 29 ++++++------ .../Analysis/Checkers/WebKit/mock-types.h | 1 + .../ref-allowing-partially-destroyed.cpp | 44 +++++++++++++++++++ 3 files changed, 60 insertions(+), 14 deletions(-) create mode 100644 clang/test/Analysis/Checkers/WebKit/ref-allowing-partially-destroyed.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index a7891d2da07c1..defd83ec8e179 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -103,15 +103,18 @@ std::optional isRefCountable(const CXXRecordDecl* R) return hasRef && hasDeref; } +bool isRefType(const std::string &Name) { + return Name == "Ref" || Name == "RefAllowingPartiallyDestroyed" || + Name == "RefPtr" || Name == "RefPtrAllowingPartiallyDestroyed"; +} + bool isCtorOfRefCounted(const clang::FunctionDecl *F) { assert(F); - const auto &FunctionName = safeGetName(F); - - return FunctionName == "Ref" || FunctionName == "makeRef" - - || FunctionName == "RefPtr" || FunctionName == "makeRefPtr" + const std::string &FunctionName = safeGetName(F); - || FunctionName == "UniqueRef" || FunctionName == "makeUniqueRef" || + return isRefType(FunctionName) || FunctionName == "makeRef" || + FunctionName == "makeRefPtr" || FunctionName == "UniqueRef" || + FunctionName == "makeUniqueRef" || FunctionName == "makeUniqueRefWithoutFastMallocCheck" || FunctionName == "String" || FunctionName == "AtomString" || @@ -131,7 +134,7 @@ bool isReturnValueRefCounted(const clang::FunctionDecl *F) { if (auto *specialT = type->getAs()) { if (auto *decl = specialT->getTemplateName().getAsTemplateDecl()) { auto name = decl->getNameAsString(); - return name == "Ref" || name == "RefPtr"; + return isRefType(name); } return false; } @@ -172,20 +175,18 @@ std::optional isGetterOfRefCounted(const CXXMethodDecl* M) if (isa(M)) { const 
CXXRecordDecl *calleeMethodsClass = M->getParent(); auto className = safeGetName(calleeMethodsClass); - auto methodName = safeGetName(M); + auto method = safeGetName(M); - if (((className == "Ref" || className == "RefPtr") && - methodName == "get") || - (className == "Ref" && methodName == "ptr") || + if ((isRefType(className) && (method == "get" || method == "ptr")) || ((className == "String" || className == "AtomString" || className == "AtomStringImpl" || className == "UniqueString" || className == "UniqueStringImpl" || className == "Identifier") && - methodName == "impl")) + method == "impl")) return true; // Ref -> T conversion // FIXME: Currently allowing any Ref -> whatever cast. - if (className == "Ref" || className == "RefPtr") { + if (isRefType(className)) { if (auto *maybeRefToRawOperator = dyn_cast(M)) { if (auto *targetConversionType = maybeRefToRawOperator->getConversionType().getTypePtrOrNull()) { @@ -202,7 +203,7 @@ bool isRefCounted(const CXXRecordDecl *R) { if (auto *TmplR = R->getTemplateInstantiationPattern()) { // FIXME: String/AtomString/UniqueString const auto &ClassName = safeGetName(TmplR); - return ClassName == "RefPtr" || ClassName == "Ref"; + return isRefType(ClassName); } return false; } diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h index 82db67bb031dd..e2b3401d40739 100644 --- a/clang/test/Analysis/Checkers/WebKit/mock-types.h +++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h @@ -16,6 +16,7 @@ template struct Ref { } T *get() { return t; } T *ptr() { return t; } + T *operator->() { return t; } operator const T &() const { return *t; } operator T &() { return *t; } }; diff --git a/clang/test/Analysis/Checkers/WebKit/ref-allowing-partially-destroyed.cpp b/clang/test/Analysis/Checkers/WebKit/ref-allowing-partially-destroyed.cpp new file mode 100644 index 0000000000000..6d96c14102a90 --- /dev/null +++ 
b/clang/test/Analysis/Checkers/WebKit/ref-allowing-partially-destroyed.cpp @@ -0,0 +1,44 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s +// expected-no-diagnostics + +#include "mock-types.h" + +template struct RefAllowingPartiallyDestroyed { + T *t; + + RefAllowingPartiallyDestroyed() : t{} {}; + RefAllowingPartiallyDestroyed(T &) {} + T *get() { return t; } + T *ptr() { return t; } + T *operator->() { return t; } + operator const T &() const { return *t; } + operator T &() { return *t; } +}; + +template struct RefPtrAllowingPartiallyDestroyed { + T *t; + + RefPtrAllowingPartiallyDestroyed() : t(new T) {} + RefPtrAllowingPartiallyDestroyed(T *t) : t(t) {} + T *get() { return t; } + T *operator->() { return t; } + const T *operator->() const { return t; } + T &operator*() { return *t; } + RefPtrAllowingPartiallyDestroyed &operator=(T *) { return *this; } + operator bool() { return t; } +}; + +class RefCounted { +public: + void ref() const; + void deref() const; + void someFunction(); +}; + +RefAllowingPartiallyDestroyed object1(); +RefPtrAllowingPartiallyDestroyed object2(); + +void testFunction() { + object1()->someFunction(); + object2()->someFunction(); +} From a445474d3fdec2bdaaa42a6dc83c2fb01867076f Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Wed, 21 Feb 2024 15:04:29 +0800 Subject: [PATCH 057/351] [RISCV] Use TImmLeaf for csr_sysreg (#82463) And use `getTargetConstant` to create operands. This PR addresses comments after committing #82322. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 8 ++++---- llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 ++- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 87f7813c5d5d5..25a27a91a1635 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -11734,14 +11734,14 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue LoCounter, HiCounter; MVT XLenVT = Subtarget.getXLenVT(); if (N->getOpcode() == ISD::READCYCLECOUNTER) { - LoCounter = DAG.getConstant( + LoCounter = DAG.getTargetConstant( RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding, DL, XLenVT); - HiCounter = DAG.getConstant( + HiCounter = DAG.getTargetConstant( RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding, DL, XLenVT); } else { - LoCounter = DAG.getConstant( + LoCounter = DAG.getTargetConstant( RISCVSysReg::lookupSysRegByName("TIME")->Encoding, DL, XLenVT); - HiCounter = DAG.getConstant( + HiCounter = DAG.getTargetConstant( RISCVSysReg::lookupSysRegByName("TIMEH")->Encoding, DL, XLenVT); } SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 83b1c68eea61a..a38463f810270 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -127,7 +127,8 @@ enum NodeType : unsigned { FMAX, FMIN, // A read of the 64-bit counter CSR on a 32-bit target (returns (Lo, Hi)). - // It takes a chain operand. + // It takes a chain operand and another two target constant operands (the + // CSR numbers of the low and high parts of the counter). READ_COUNTER_WIDE, // brev8, orc.b, zip, and unzip from Zbb and Zbkb. 
All operands are i32 or diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 0d2ffac4883a3..e753c1f1add0c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -365,7 +365,7 @@ def CSRSystemRegister : AsmOperandClass { let DiagnosticType = "InvalidCSRSystemRegister"; } -def csr_sysreg : RISCVOp, ImmLeaf(Imm);"> { +def csr_sysreg : RISCVOp, TImmLeaf(Imm);"> { let ParserMatchClass = CSRSystemRegister; let PrintMethod = "printCSRSystemRegister"; let DecoderMethod = "decodeUImmOperand<12>"; From 351e4fa2bfe5b13073c1675a1b1693ea766c1e25 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 21 Feb 2024 08:46:47 +0100 Subject: [PATCH 058/351] [Clang] Fix assert when transforming a pack indexing type. (#82234) When a pack in a pack indexing specifier cannot be immediately expanded, we were creating an incomplete TypeLoc (causing assertion failure). As we do not keep track of typelocs of expanded elements, we create a trivial typeloc Fixes #81697 --- clang/lib/Sema/TreeTransform.h | 6 +++++- clang/test/SemaCXX/cxx2c-pack-indexing.cpp | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index a32a585531873..7389a48fe56fc 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -6561,7 +6561,11 @@ TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, return QualType(); if (!ShouldExpand) { Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); - QualType Pack = getDerived().TransformType(T); + // FIXME: should we keep TypeLoc for individual expansions in + // PackIndexingTypeLoc? 
+ TypeSourceInfo *TI = + SemaRef.getASTContext().getTrivialTypeSourceInfo(T, TL.getBeginLoc()); + QualType Pack = getDerived().TransformType(TLB, TI->getTypeLoc()); if (Pack.isNull()) return QualType(); if (NotYetExpanded) { diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp index 625a56031598b..e13635383b6ca 100644 --- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp +++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp @@ -135,3 +135,22 @@ using Splice = typename SpliceImpl::type; using type = Splice, IL<1, 2>>; static_assert(is_same>); } + + +namespace GH81697 { + +template struct tuple { + int __x0; +}; + +template +Ts...[I]& get(tuple& t) { + return t.__x0; +} + +void f() { + tuple x; + get<0>(x); +} + +} From d3fb596c9720b8bf192823730e9fccc3d86de9a8 Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Wed, 21 Feb 2024 08:47:39 +0100 Subject: [PATCH 059/351] [RISCV] Fix scheduling info for compressed LD/ST of FP types. (#82339) --- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 07137031d9fc7..18d38348f7214 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -317,7 +317,7 @@ def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd), let Predicates = [HasStdExtCOrZcd, HasStdExtD] in def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>, - Sched<[WriteFLD64, ReadMemBase]> { + Sched<[WriteFLD64, ReadFMemBase]> { bits<8> imm; let Inst{12-10} = imm{5-3}; let Inst{6-5} = imm{7-6}; @@ -334,7 +334,7 @@ def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00>, let DecoderNamespace = "RISCV32Only_", Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>, - Sched<[WriteFLD32, ReadMemBase]> { + Sched<[WriteFLD32, ReadFMemBase]> { bits<7> imm; let 
Inst{12-10} = imm{5-3}; let Inst{6} = imm{2}; @@ -351,7 +351,7 @@ def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>, let Predicates = [HasStdExtCOrZcd, HasStdExtD] in def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>, - Sched<[WriteFST64, ReadStoreData, ReadMemBase]> { + Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> { bits<8> imm; let Inst{12-10} = imm{5-3}; let Inst{6-5} = imm{7-6}; @@ -368,7 +368,7 @@ def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00>, let DecoderNamespace = "RISCV32Only_", Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>, - Sched<[WriteFST32, ReadStoreData, ReadMemBase]> { + Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> { bits<7> imm; let Inst{12-10} = imm{5-3}; let Inst{6} = imm{2}; @@ -506,7 +506,7 @@ def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb), let Predicates = [HasStdExtCOrZcd, HasStdExtD] in def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>, - Sched<[WriteFLD64, ReadMemBase]> { + Sched<[WriteFLD64, ReadFMemBase]> { let Inst{6-5} = imm{4-3}; let Inst{4-2} = imm{8-6}; } @@ -520,7 +520,7 @@ def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00>, let DecoderNamespace = "RISCV32Only_", Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>, - Sched<[WriteFLD32, ReadMemBase]> { + Sched<[WriteFLD32, ReadFMemBase]> { let Inst{6-4} = imm{4-2}; let Inst{3-2} = imm{7-6}; } @@ -564,7 +564,7 @@ def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rs1_wb), let Predicates = [HasStdExtCOrZcd, HasStdExtD] in def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>, - Sched<[WriteFST64, ReadStoreData, ReadMemBase]> { + Sched<[WriteFST64, ReadFStoreData, ReadFMemBase]> { let Inst{12-10} = imm{5-3}; let Inst{9-7} = imm{8-6}; } @@ -578,7 +578,7 @@ def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00>, let DecoderNamespace = "RISCV32Only_", 
Predicates = [HasStdExtCOrZcfOrZce, HasStdExtF, IsRV32] in def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>, - Sched<[WriteFST32, ReadStoreData, ReadMemBase]> { + Sched<[WriteFST32, ReadFStoreData, ReadFMemBase]> { let Inst{12-9} = imm{5-2}; let Inst{8-7} = imm{7-6}; } From 7ce1a11f7f436234ce3eaf11c74043937a1ec36b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 21 Feb 2024 09:14:48 +0100 Subject: [PATCH 060/351] [InstCombine] Fold dependent IVs (#81151) Fold `iv = phi(start, iv.next = iv2.next + start)` where `iv2 = phi(iv2.start, iv2.next = iv2 + iv2.step)` to `iv = iv2 + start` removing one induction variable from the loop. Proof: https://alive2.llvm.org/ce/z/hfmwgf Fixes https://github.com/llvm/llvm-project/issues/77108. --- .../Transforms/InstCombine/InstCombinePHI.cpp | 55 +++++++++++++ .../Transforms/InstCombine/dependent-ivs.ll | 80 ++++++++----------- 2 files changed, 87 insertions(+), 48 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 20b34c1379d57..192ccbbcb7c7b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -1378,6 +1378,58 @@ static Value *simplifyUsingControlFlow(InstCombiner &Self, PHINode &PN, return nullptr; } +// Fold iv = phi(start, iv.next = iv2.next op start) +// where iv2 = phi(iv2.start, iv2.next = iv2 + iv2.step) +// and iv2.start op start = start +// to iv = iv2 op start +static Value *foldDependentIVs(PHINode &PN, IRBuilderBase &Builder) { + BasicBlock *BB = PN.getParent(); + if (PN.getNumIncomingValues() != 2) + return nullptr; + + Value *Start; + Instruction *IvNext; + BinaryOperator *Iv2Next; + auto MatchOuterIV = [&](Value *V1, Value *V2) { + if (match(V2, m_c_BinOp(m_Specific(V1), m_BinOp(Iv2Next))) || + match(V2, m_GEP(m_Specific(V1), m_BinOp(Iv2Next)))) { + Start = V1; + IvNext = cast(V2); + return true; + } + return false; + }; + + if 
(!MatchOuterIV(PN.getIncomingValue(0), PN.getIncomingValue(1)) && + !MatchOuterIV(PN.getIncomingValue(1), PN.getIncomingValue(0))) + return nullptr; + + PHINode *Iv2; + Value *Iv2Start, *Iv2Step; + if (!matchSimpleRecurrence(Iv2Next, Iv2, Iv2Start, Iv2Step) || + Iv2->getParent() != BB) + return nullptr; + + auto *BO = dyn_cast(IvNext); + Constant *Identity = + BO ? ConstantExpr::getBinOpIdentity(BO->getOpcode(), Iv2Start->getType()) + : Constant::getNullValue(Iv2Start->getType()); + if (Iv2Start != Identity) + return nullptr; + + Builder.SetInsertPoint(&*BB, BB->getFirstInsertionPt()); + if (!BO) { + auto *GEP = cast(IvNext); + return Builder.CreateGEP(GEP->getSourceElementType(), Start, Iv2, "", + cast(IvNext)->isInBounds()); + } + + assert(BO->isCommutative() && "Must be commutative"); + Value *Res = Builder.CreateBinOp(BO->getOpcode(), Iv2, Start); + cast(Res)->copyIRFlags(BO); + return Res; +} + // PHINode simplification // Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { @@ -1595,5 +1647,8 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { if (auto *V = simplifyUsingControlFlow(*this, PN, DT)) return replaceInstUsesWith(PN, V); + if (Value *Res = foldDependentIVs(PN, Builder)) + return replaceInstUsesWith(PN, Res); + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/dependent-ivs.ll b/llvm/test/Transforms/InstCombine/dependent-ivs.ll index d043c7388e434..c2cff61ecb388 100644 --- a/llvm/test/Transforms/InstCombine/dependent-ivs.ll +++ b/llvm/test/Transforms/InstCombine/dependent-ivs.ll @@ -7,11 +7,10 @@ define void @int_iv_nuw(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = add nuw i64 
[[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -39,11 +38,10 @@ define void @int_iv_nsw(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = add nsw i64 [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = add nsw i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -72,11 +70,10 @@ define void @int_iv_commuted_add(i64 %base, i64 %end) { ; CHECK-NEXT: [[BASE2:%.*]] = mul i64 [[BASE]], 42 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE2]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = add i64 [[IV]], [[BASE2]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[BASE2]], [[IV_NEXT]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -105,11 +102,10 @@ define void @int_iv_commuted_phi1(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; 
CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[BASE]], [[ENTRY:%.*]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV2:%.*]] = add i64 [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -137,11 +133,10 @@ define void @int_iv_commuted_phi2(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = add i64 [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -169,11 +164,10 @@ define void @int_iv_vector(<2 x i64> %base) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi <2 x i64> [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = add <2 x i64> [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.v2i64(<2 
x i64> [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw <2 x i64> [[IV]], -; CHECK-NEXT: [[IV2_NEXT]] = add <2 x i64> [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1() ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -233,12 +227,11 @@ define void @int_iv_loop_variant_step(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = add nuw i64 [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[STEP:%.*]] = call i64 @get.i64() ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[STEP]] -; CHECK-NEXT: [[IV2_NEXT]] = add nuw i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -267,11 +260,10 @@ define void @int_iv_xor(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = xor i64 [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = xor i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -299,11 +291,10 @@ define void @int_iv_or(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; 
CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = or i64 [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = or i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -331,11 +322,10 @@ define void @int_iv_or_disjoint(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = or disjoint i64 [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV2_NEXT]] = or disjoint i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -363,11 +353,10 @@ define void @int_iv_and(i64 %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[IV2_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ -1, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ -1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV2:%.*]] = and i64 [[IV]], [[BASE]] ; CHECK-NEXT: call void @use.i64(i64 [[IV2]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; 
CHECK-NEXT: [[IV2_NEXT]] = and i64 [[IV_NEXT]], [[BASE]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -523,11 +512,10 @@ define void @ptr_iv_inbounds(ptr %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_PTR:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[IV]] ; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -555,11 +543,10 @@ define void @ptr_iv_no_inbounds(ptr %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_PTR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[IV]] ; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i8, ptr [[BASE]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -587,11 +574,10 @@ define void @ptr_iv_non_i8_type(ptr %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: 
loop: -; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_PTR:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[IV]] ; CHECK-NEXT: call void @use.p0(ptr [[IV_PTR]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i32, ptr [[BASE]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -619,11 +605,10 @@ define void @ptr_iv_vector(<2 x ptr> %base, i64 %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_PTR:%.*]] = phi <2 x ptr> [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_PTR:%.*]] = getelementptr inbounds i8, <2 x ptr> [[BASE]], i64 [[IV]] ; CHECK-NEXT: call void @use.v2p0(<2 x ptr> [[IV_PTR]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i8, <2 x ptr> [[BASE]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -651,11 +636,10 @@ define void @ptr_iv_vector2(<2 x ptr> %base) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_PTR:%.*]] = phi <2 x ptr> [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ], [ [[BASE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi <2 x i64> [ [[IV_NEXT:%.*]], [[LOOP]] ], [ zeroinitializer, 
[[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV_PTR:%.*]] = getelementptr i8, <2 x ptr> [[BASE]], <2 x i64> [[IV]] ; CHECK-NEXT: call void @use.v2p0(<2 x ptr> [[IV_PTR]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw <2 x i64> [[IV]], -; CHECK-NEXT: [[IV_PTR_NEXT]] = getelementptr i8, <2 x ptr> [[BASE]], <2 x i64> [[IV_NEXT]] ; CHECK-NEXT: [[CMP:%.*]] = call i1 @get.i1() ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: From 1246b64faa5eea1553c1c1aad425c31b701fa6ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Wed, 21 Feb 2024 09:18:01 +0100 Subject: [PATCH 061/351] [clang][analyzer] Change modeling of 'fileno' in checkers. (#81842) Function 'fileno' fails only if invalid pointer is passed, this is a case that is often ignored in source code. The failure case leads to many "false positive" reports when `fileno` returns -1 and this is not checked in the program. Because this, the function is now assumed to not fail (this is assumption that the passed file pointer is correct). The change affects `StdCLibraryFunctionsChecker` and `StreamChecker`. 
--- .../Checkers/StdLibraryFunctionsChecker.cpp | 9 +- .../StaticAnalyzer/Checkers/StreamChecker.cpp | 193 ++++++++++-------- .../std-c-library-functions-path-notes.c | 22 +- clang/test/Analysis/stream-errno-note.c | 12 +- clang/test/Analysis/stream-errno.c | 16 +- clang/test/Analysis/stream-error.c | 18 ++ clang/test/Analysis/stream-noopen.c | 10 + 7 files changed, 159 insertions(+), 121 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 6b8ac2629453d..6cc8867945814 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -2388,12 +2388,15 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .ArgConstraint(NotNull(ArgNo(0)))); // int fileno(FILE *stream); + // According to POSIX 'fileno' may fail and set 'errno'. + // But in Linux it may fail only if the specified file pointer is invalid. + // At many places 'fileno' is used without check for failure and a failure + // case here would produce a large amount of likely false positive warnings. + // To avoid this, we assume here that it does not fail. 
addToFunctionSummaryMap( "fileno", Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), Summary(NoEvalCall) - .Case(ReturnsValidFileDescriptor, ErrnoMustNotBeChecked, - GenericSuccessMsg) - .Case(ReturnsMinusOne, ErrnoNEZeroIrrelevant, GenericFailureMsg) + .Case(ReturnsValidFileDescriptor, ErrnoUnchanged, GenericSuccessMsg) .ArgConstraint(NotNull(ArgNo(0)))); // void rewind(FILE *stream); diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 7e7e3f0eee2b4..a070f451694a3 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -249,6 +249,10 @@ struct StreamOperationEvaluator { bool isStreamEof() const { return SS->ErrorState == ErrorFEof; } + NonLoc getZeroVal(const CallEvent &Call) { + return *SVB.makeZeroVal(Call.getResultType()).getAs(); + } + ProgramStateRef setStreamState(ProgramStateRef State, const StreamState &NewSS) { return State->set(StreamSym, NewSS); @@ -390,7 +394,8 @@ class StreamChecker : public Checker FnTestDescriptions = { @@ -486,6 +491,9 @@ class StreamChecker : public CheckerBindExpr(E.CE, C.getLocationContext(), RetVal); StateNotFailed = - E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, - *E.SVB.makeZeroVal(E.ACtx.IntTy).getAs()); + E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, E.getZeroVal(Call)); if (!StateNotFailed) return; StateNotFailed = @@ -1003,8 +1010,7 @@ void StreamChecker::evalFscanf(const FnDescription *Desc, const CallEvent &Call, ProgramStateRef StateNotFailed = State->BindExpr(E.CE, C.getLocationContext(), RetVal); StateNotFailed = - E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, - *E.SVB.makeZeroVal(E.ACtx.IntTy).getAs()); + E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, E.getZeroVal(Call)); if (StateNotFailed) C.addTransition(StateNotFailed); } @@ -1073,8 +1079,7 @@ void StreamChecker::evalGetdelim(const FnDescription *Desc, ProgramStateRef StateNotFailed = State->BindExpr(E.CE, 
C.getLocationContext(), RetVal); StateNotFailed = - E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, - *E.SVB.makeZeroVal(E.CE->getType()).getAs()); + E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, E.getZeroVal(Call)); if (!StateNotFailed) return; C.addTransition(StateNotFailed); @@ -1200,8 +1205,7 @@ void StreamChecker::evalFtell(const FnDescription *Desc, const CallEvent &Call, ProgramStateRef StateNotFailed = State->BindExpr(E.CE, C.getLocationContext(), RetVal); StateNotFailed = - E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, - *E.SVB.makeZeroVal(Call.getResultType()).getAs()); + E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, E.getZeroVal(Call)); if (!StateNotFailed) return; @@ -1226,79 +1230,6 @@ void StreamChecker::evalRewind(const FnDescription *Desc, const CallEvent &Call, C.addTransition(State); } -void StreamChecker::evalClearerr(const FnDescription *Desc, - const CallEvent &Call, - CheckerContext &C) const { - ProgramStateRef State = C.getState(); - StreamOperationEvaluator E(C); - if (!E.Init(Desc, Call, C, State)) - return; - - // FilePositionIndeterminate is not cleared. - State = E.setStreamState( - State, - StreamState::getOpened(Desc, ErrorNone, E.SS->FilePositionIndeterminate)); - C.addTransition(State); -} - -void StreamChecker::evalFeofFerror(const FnDescription *Desc, - const CallEvent &Call, CheckerContext &C, - const StreamErrorState &ErrorKind) const { - ProgramStateRef State = C.getState(); - StreamOperationEvaluator E(C); - if (!E.Init(Desc, Call, C, State)) - return; - - if (E.SS->ErrorState & ErrorKind) { - // Execution path with error of ErrorKind. - // Function returns true. - // From now on it is the only one error state. 
- ProgramStateRef TrueState = bindAndAssumeTrue(State, C, E.CE); - C.addTransition(E.setStreamState( - TrueState, StreamState::getOpened(Desc, ErrorKind, - E.SS->FilePositionIndeterminate && - !ErrorKind.isFEof()))); - } - if (StreamErrorState NewES = E.SS->ErrorState & (~ErrorKind)) { - // Execution path(s) with ErrorKind not set. - // Function returns false. - // New error state is everything before minus ErrorKind. - ProgramStateRef FalseState = E.bindReturnValue(State, C, 0); - C.addTransition(E.setStreamState( - FalseState, - StreamState::getOpened( - Desc, NewES, E.SS->FilePositionIndeterminate && !NewES.isFEof()))); - } -} - -void StreamChecker::preDefault(const FnDescription *Desc, const CallEvent &Call, - CheckerContext &C) const { - ProgramStateRef State = C.getState(); - SVal StreamVal = getStreamArg(Desc, Call); - State = ensureStreamNonNull(StreamVal, Call.getArgExpr(Desc->StreamArgNo), C, - State); - if (!State) - return; - State = ensureStreamOpened(StreamVal, C, State); - if (!State) - return; - - C.addTransition(State); -} - -void StreamChecker::evalSetFeofFerror(const FnDescription *Desc, - const CallEvent &Call, CheckerContext &C, - const StreamErrorState &ErrorKind) const { - ProgramStateRef State = C.getState(); - SymbolRef StreamSym = getStreamArg(Desc, Call).getAsSymbol(); - assert(StreamSym && "Operation not permitted on non-symbolic stream value."); - const StreamState *SS = State->get(StreamSym); - assert(SS && "Stream should be tracked by the checker."); - State = State->set( - StreamSym, StreamState::getOpened(SS->LastOperation, ErrorKind)); - C.addTransition(State); -} - void StreamChecker::preFflush(const FnDescription *Desc, const CallEvent &Call, CheckerContext &C) const { ProgramStateRef State = C.getState(); @@ -1377,6 +1308,104 @@ void StreamChecker::evalFflush(const FnDescription *Desc, const CallEvent &Call, C.addTransition(StateFailed); } +void StreamChecker::evalClearerr(const FnDescription *Desc, + const CallEvent &Call, + 
CheckerContext &C) const { + ProgramStateRef State = C.getState(); + StreamOperationEvaluator E(C); + if (!E.Init(Desc, Call, C, State)) + return; + + // FilePositionIndeterminate is not cleared. + State = E.setStreamState( + State, + StreamState::getOpened(Desc, ErrorNone, E.SS->FilePositionIndeterminate)); + C.addTransition(State); +} + +void StreamChecker::evalFeofFerror(const FnDescription *Desc, + const CallEvent &Call, CheckerContext &C, + const StreamErrorState &ErrorKind) const { + ProgramStateRef State = C.getState(); + StreamOperationEvaluator E(C); + if (!E.Init(Desc, Call, C, State)) + return; + + if (E.SS->ErrorState & ErrorKind) { + // Execution path with error of ErrorKind. + // Function returns true. + // From now on it is the only one error state. + ProgramStateRef TrueState = bindAndAssumeTrue(State, C, E.CE); + C.addTransition(E.setStreamState( + TrueState, StreamState::getOpened(Desc, ErrorKind, + E.SS->FilePositionIndeterminate && + !ErrorKind.isFEof()))); + } + if (StreamErrorState NewES = E.SS->ErrorState & (~ErrorKind)) { + // Execution path(s) with ErrorKind not set. + // Function returns false. + // New error state is everything before minus ErrorKind. + ProgramStateRef FalseState = E.bindReturnValue(State, C, 0); + C.addTransition(E.setStreamState( + FalseState, + StreamState::getOpened( + Desc, NewES, E.SS->FilePositionIndeterminate && !NewES.isFEof()))); + } +} + +void StreamChecker::evalFileno(const FnDescription *Desc, const CallEvent &Call, + CheckerContext &C) const { + // Fileno should fail only if the passed pointer is invalid. + // Some of the preconditions are checked already in preDefault. + // Here we can assume that the operation does not fail, because if we + // introduced a separate branch where fileno() returns -1, then it would cause + // many unexpected and unwanted warnings in situations where fileno() is + // called on valid streams. 
+ // The stream error states are not modified by 'fileno', and 'errno' is also + // left unchanged (so this evalCall does not invalidate it, but we have a + // custom evalCall instead of the default that would invalidate it). + ProgramStateRef State = C.getState(); + StreamOperationEvaluator E(C); + if (!E.Init(Desc, Call, C, State)) + return; + + NonLoc RetVal = makeRetVal(C, E.CE).castAs(); + State = State->BindExpr(E.CE, C.getLocationContext(), RetVal); + State = E.assumeBinOpNN(State, BO_GE, RetVal, E.getZeroVal(Call)); + if (!State) + return; + + C.addTransition(State); +} + +void StreamChecker::preDefault(const FnDescription *Desc, const CallEvent &Call, + CheckerContext &C) const { + ProgramStateRef State = C.getState(); + SVal StreamVal = getStreamArg(Desc, Call); + State = ensureStreamNonNull(StreamVal, Call.getArgExpr(Desc->StreamArgNo), C, + State); + if (!State) + return; + State = ensureStreamOpened(StreamVal, C, State); + if (!State) + return; + + C.addTransition(State); +} + +void StreamChecker::evalSetFeofFerror(const FnDescription *Desc, + const CallEvent &Call, CheckerContext &C, + const StreamErrorState &ErrorKind) const { + ProgramStateRef State = C.getState(); + SymbolRef StreamSym = getStreamArg(Desc, Call).getAsSymbol(); + assert(StreamSym && "Operation not permitted on non-symbolic stream value."); + const StreamState *SS = State->get(StreamSym); + assert(SS && "Stream should be tracked by the checker."); + State = State->set( + StreamSym, StreamState::getOpened(SS->LastOperation, ErrorKind)); + C.addTransition(State); +} + ProgramStateRef StreamChecker::ensureStreamNonNull(SVal StreamVal, const Expr *StreamE, CheckerContext &C, diff --git a/clang/test/Analysis/std-c-library-functions-path-notes.c b/clang/test/Analysis/std-c-library-functions-path-notes.c index 4df00fe1e6064..6449b71928fa7 100644 --- a/clang/test/Analysis/std-c-library-functions-path-notes.c +++ b/clang/test/Analysis/std-c-library-functions-path-notes.c @@ -61,24 +61,22 @@ 
int test_islower(int *x) { } int test_bugpath_notes(FILE *f1, char c, FILE *f2) { - int f = fileno(f2); - if (f == -1) // \ + // This test has the purpose of checking that notes appear at correct place. + long a = ftell(f2); // no note + if (a == -1) // \ // expected-note{{Taking false branch}} - return 0; - int l = islower(c); - f = fileno(f1); // \ - // expected-note{{Value assigned to 'f'}} \ - // expected-note{{Assuming that 'fileno' fails}} - return dup(f); // \ + return -1; + int l = islower(c); // no note + a = ftell(f1); // \ + // expected-note{{Value assigned to 'a'}} \ + // expected-note{{Assuming that 'ftell' fails}} + return dup(a); // \ // expected-warning{{The 1st argument to 'dup' is -1 but should be >= 0}} \ // expected-note{{The 1st argument to 'dup' is -1 but should be >= 0}} } int test_fileno_arg_note(FILE *f1) { - return dup(fileno(f1)); // \ - // expected-warning{{The 1st argument to 'dup' is < 0 but should be >= 0}} \ - // expected-note{{The 1st argument to 'dup' is < 0 but should be >= 0}} \ - // expected-note{{Assuming that 'fileno' fails}} + return dup(fileno(f1)); // no warning } int test_readlink_bufsize_zero(char *Buf, size_t Bufsize) { diff --git a/clang/test/Analysis/stream-errno-note.c b/clang/test/Analysis/stream-errno-note.c index 2531e26e20038..2411a2d9a00a7 100644 --- a/clang/test/Analysis/stream-errno-note.c +++ b/clang/test/Analysis/stream-errno-note.c @@ -141,16 +141,8 @@ void check_rewind_errnocheck(void) { } void check_fileno(void) { - FILE *F = tmpfile(); - // expected-note@+2{{'F' is non-null}} - // expected-note@+1{{Taking false branch}} - if (!F) - return; - fileno(F); - // expected-note@-1{{Assuming that 'fileno' is successful; 'errno' becomes undefined after the call}} - if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}} - // expected-note@-1{{An undefined value may be read from 'errno'}} - (void)fclose(F); + // nothing to check: checker assumes that 'fileno' is always successful + // (and 
does not change 'errno') } void check_fwrite_zeroarg(size_t Siz) { diff --git a/clang/test/Analysis/stream-errno.c b/clang/test/Analysis/stream-errno.c index fab6a58b3275a..5f0a58032fa26 100644 --- a/clang/test/Analysis/stream-errno.c +++ b/clang/test/Analysis/stream-errno.c @@ -173,6 +173,8 @@ void check_no_errno_change(void) { if (errno) {} // no-warning ferror(F); if (errno) {} // no-warning + fileno(F); + if (errno) {} // no-warning clang_analyzer_eval(errno == 1); // expected-warning{{TRUE}} fclose(F); } @@ -250,20 +252,6 @@ void check_rewind(void) { fclose(F); } -void check_fileno(void) { - FILE *F = tmpfile(); - if (!F) - return; - int N = fileno(F); - if (N == -1) { - clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}} - if (errno) {} // no-warning - fclose(F); - return; - } - if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}} -} - void check_fflush_opened_file(void) { FILE *F = tmpfile(); if (!F) diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c index 4bab07577ccd5..ac31083bfc691 100644 --- a/clang/test/Analysis/stream-error.c +++ b/clang/test/Analysis/stream-error.c @@ -491,6 +491,24 @@ void error_ftello(void) { fclose(F); } +void error_fileno(void) { + FILE *F = fopen("file", "r"); + if (!F) + return; + int N = fileno(F); + clang_analyzer_eval(N >= 0); // expected-warning {{TRUE}} + clang_analyzer_eval(feof(F) && ferror(F)); // expected-warning {{FALSE}} + StreamTesterChecker_make_feof_stream(F); + N = fileno(F); + clang_analyzer_eval(feof(F)); // expected-warning {{TRUE}} + clang_analyzer_eval(ferror(F)); // expected-warning {{FALSE}} + StreamTesterChecker_make_ferror_stream(F); + N = fileno(F); + clang_analyzer_eval(feof(F)); // expected-warning {{FALSE}} + clang_analyzer_eval(ferror(F)); // expected-warning {{TRUE}} + fclose(F); +} + void error_fflush_on_non_null_stream_clear_error_states(void) { FILE *F0 = tmpfile(), *F1 = tmpfile(); // `fflush` clears a non-EOF stream's error 
state. diff --git a/clang/test/Analysis/stream-noopen.c b/clang/test/Analysis/stream-noopen.c index 8bd01a90cf859..644c699d05e24 100644 --- a/clang/test/Analysis/stream-noopen.c +++ b/clang/test/Analysis/stream-noopen.c @@ -268,6 +268,16 @@ void test_clearerr(FILE *F) { // expected-warning@-1{{FALSE}} } +void test_fileno(FILE *F) { + errno = 0; + int A = fileno(F); + clang_analyzer_eval(F != NULL); // expected-warning{{TRUE}} + clang_analyzer_eval(A >= 0); // expected-warning{{TRUE}} + if (errno) {} // no-warning + clang_analyzer_eval(errno == 0); // expected-warning{{TRUE}} + // expected-warning@-1{{FALSE}} +} + void freadwrite_zerosize(FILE *F) { fwrite(WBuf, 1, 0, F); clang_analyzer_eval(feof(F)); // expected-warning {{UNKNOWN}} From 02fad0565fe7f061bdaa79ff33b29f64b2c290eb Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 21 Feb 2024 16:27:43 +0800 Subject: [PATCH 062/351] [RISCV][SDAG] Fold `select c, ~x, x` into `xor -c, x` (#82462) This patch lowers select of constants if `TrueV == ~FalseV`. Address the comment in https://github.com/llvm/llvm-project/pull/82456#discussion_r1496881603. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 ++ llvm/test/CodeGen/RISCV/select.ll | 177 ++++++++++++++++++++ 2 files changed, 187 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 25a27a91a1635..c2fef4993f6ec 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -7239,6 +7239,16 @@ static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG, } } + // select c, ~x, x --> xor -c, x + if (isa(TrueV) && isa(FalseV)) { + const APInt &TrueVal = TrueV->getAsAPIntVal(); + const APInt &FalseVal = FalseV->getAsAPIntVal(); + if (~TrueVal == FalseVal) { + SDValue Neg = DAG.getNegative(CondV, DL, VT); + return DAG.getNode(ISD::XOR, DL, VT, Neg, FalseV); + } + } + // Try to fold (select (setcc lhs, rhs, cc), truev, falsev) into bitwise ops // when both truev and falsev are also setcc. if (CondV.getOpcode() == ISD::SETCC && TrueV.getOpcode() == ISD::SETCC && diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index 7dd223df5e557..e01984b7c5843 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -1449,3 +1449,180 @@ entry: %res = select i1 %cond, i32 %a, i32 %c ret i32 %res } + +define i32 @select_cst_not1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: select_cst_not1: +; CHECK: # %bb.0: +; CHECK-NEXT: slt a0, a0, a1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: xori a0, a0, -6 +; CHECK-NEXT: ret + %cond = icmp slt i32 %a, %b + %ret = select i1 %cond, i32 5, i32 -6 + ret i32 %ret +} + +define i32 @select_cst_not2(i32 signext %a) { +; CHECK-LABEL: select_cst_not2: +; CHECK: # %bb.0: +; CHECK-NEXT: srai a0, a0, 31 +; CHECK-NEXT: xori a0, a0, -6 +; CHECK-NEXT: ret + %cond = icmp slt i32 %a, 0 + %ret = select i1 %cond, i32 5, i32 -6 + ret i32 %ret +} + +define i32 @select_cst_not3(i32 signext %a) { +; CHECK-LABEL: select_cst_not3: +; CHECK: # %bb.0: +; CHECK-NEXT: srai a0, a0, 31 
+; CHECK-NEXT: xori a0, a0, 5 +; CHECK-NEXT: ret + %cond = icmp sgt i32 %a, -1 + %ret = select i1 %cond, i32 5, i32 -6 + ret i32 %ret +} + +define i32 @select_cst_not4(i32 signext %a, i32 signext %b) { +; RV32IM-LABEL: select_cst_not4: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slt a0, a0, a1 +; RV32IM-NEXT: lui a1, 524288 +; RV32IM-NEXT: addi a1, a1, -1 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst_not4: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slt a0, a0, a1 +; RV64IM-NEXT: neg a0, a0 +; RV64IM-NEXT: lui a1, 524288 +; RV64IM-NEXT: addiw a1, a1, -1 +; RV64IM-NEXT: xor a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst_not4: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: slt a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: neg a0, a0 +; RV64IMXVTCONDOPS-NEXT: lui a1, 524288 +; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, -1 +; RV64IMXVTCONDOPS-NEXT: xor a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: ret +; +; RV32IMZICOND-LABEL: select_cst_not4: +; RV32IMZICOND: # %bb.0: +; RV32IMZICOND-NEXT: slt a0, a0, a1 +; RV32IMZICOND-NEXT: lui a1, 524288 +; RV32IMZICOND-NEXT: addi a1, a1, -1 +; RV32IMZICOND-NEXT: add a0, a0, a1 +; RV32IMZICOND-NEXT: ret +; +; RV64IMZICOND-LABEL: select_cst_not4: +; RV64IMZICOND: # %bb.0: +; RV64IMZICOND-NEXT: slt a0, a0, a1 +; RV64IMZICOND-NEXT: neg a0, a0 +; RV64IMZICOND-NEXT: lui a1, 524288 +; RV64IMZICOND-NEXT: addiw a1, a1, -1 +; RV64IMZICOND-NEXT: xor a0, a0, a1 +; RV64IMZICOND-NEXT: ret + %cond = icmp slt i32 %a, %b + %ret = select i1 %cond, i32 -2147483648, i32 2147483647 + ret i32 %ret +} + +define i32 @select_cst_not5(i32 signext %a, i32 signext %b) { +; RV32IM-LABEL: select_cst_not5: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slt a0, a0, a1 +; RV32IM-NEXT: neg a0, a0 +; RV32IM-NEXT: lui a1, 16 +; RV32IM-NEXT: addi a1, a1, -5 +; RV32IM-NEXT: xor a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst_not5: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slt a0, a0, a1 +; RV64IM-NEXT: neg a0, a0 +; RV64IM-NEXT: lui a1, 16 
+; RV64IM-NEXT: addiw a1, a1, -5 +; RV64IM-NEXT: xor a0, a0, a1 +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst_not5: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: slt a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: neg a0, a0 +; RV64IMXVTCONDOPS-NEXT: lui a1, 16 +; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, -5 +; RV64IMXVTCONDOPS-NEXT: xor a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: ret +; +; RV32IMZICOND-LABEL: select_cst_not5: +; RV32IMZICOND: # %bb.0: +; RV32IMZICOND-NEXT: slt a0, a0, a1 +; RV32IMZICOND-NEXT: neg a0, a0 +; RV32IMZICOND-NEXT: lui a1, 16 +; RV32IMZICOND-NEXT: addi a1, a1, -5 +; RV32IMZICOND-NEXT: xor a0, a0, a1 +; RV32IMZICOND-NEXT: ret +; +; RV64IMZICOND-LABEL: select_cst_not5: +; RV64IMZICOND: # %bb.0: +; RV64IMZICOND-NEXT: slt a0, a0, a1 +; RV64IMZICOND-NEXT: neg a0, a0 +; RV64IMZICOND-NEXT: lui a1, 16 +; RV64IMZICOND-NEXT: addiw a1, a1, -5 +; RV64IMZICOND-NEXT: xor a0, a0, a1 +; RV64IMZICOND-NEXT: ret + %cond = icmp slt i32 %a, %b + %ret = select i1 %cond, i32 -65532, i32 65531 + ret i32 %ret +} + +define i32 @select_cst_unknown(i32 signext %a, i32 signext %b) { +; RV32IM-LABEL: select_cst_unknown: +; RV32IM: # %bb.0: +; RV32IM-NEXT: mv a2, a0 +; RV32IM-NEXT: li a0, 5 +; RV32IM-NEXT: blt a2, a1, .LBB42_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: li a0, -7 +; RV32IM-NEXT: .LBB42_2: +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst_unknown: +; RV64IM: # %bb.0: +; RV64IM-NEXT: mv a2, a0 +; RV64IM-NEXT: li a0, 5 +; RV64IM-NEXT: blt a2, a1, .LBB42_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: li a0, -7 +; RV64IM-NEXT: .LBB42_2: +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst_unknown: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: slt a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: li a1, -7 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a1, a1, a0 +; RV64IMXVTCONDOPS-NEXT: li a2, 5 +; RV64IMXVTCONDOPS-NEXT: vt.maskc a0, a2, a0 +; RV64IMXVTCONDOPS-NEXT: or a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: ret +; +; CHECKZICOND-LABEL: select_cst_unknown: +; 
CHECKZICOND: # %bb.0: +; CHECKZICOND-NEXT: slt a0, a0, a1 +; CHECKZICOND-NEXT: li a1, -7 +; CHECKZICOND-NEXT: czero.nez a1, a1, a0 +; CHECKZICOND-NEXT: li a2, 5 +; CHECKZICOND-NEXT: czero.eqz a0, a2, a0 +; CHECKZICOND-NEXT: or a0, a0, a1 +; CHECKZICOND-NEXT: ret + %cond = icmp slt i32 %a, %b + %ret = select i1 %cond, i32 5, i32 -7 + ret i32 %ret +} From 8b84de26dfc1ba742b427e45bc900bc233fd58e1 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Wed, 21 Feb 2024 09:48:20 +0100 Subject: [PATCH 063/351] =?UTF-8?q?[llvm-exegesis][NFC]=20Refactor=20all?= =?UTF-8?q?=20`ValidationEvent`=20info=20in=20a=20single=20=E2=80=A6=20(#8?= =?UTF-8?q?2256)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …table. All data is derived from a single table rather than being spread out over an enum, a table and the main entry point. This is intended as a replacement for #82092. --- llvm/include/llvm/Target/TargetPfmCounters.td | 1 + .../llvm-exegesis/lib/BenchmarkResult.cpp | 49 +-------------- .../tools/llvm-exegesis/lib/BenchmarkResult.h | 15 +---- llvm/tools/llvm-exegesis/lib/CMakeLists.txt | 1 + .../lib/LatencyBenchmarkRunner.cpp | 4 +- llvm/tools/llvm-exegesis/lib/Target.h | 1 + .../llvm-exegesis/lib/ValidationEvent.cpp | 56 +++++++++++++++++ .../tools/llvm-exegesis/lib/ValidationEvent.h | 60 +++++++++++++++++++ llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 20 +------ 9 files changed, 127 insertions(+), 80 deletions(-) create mode 100644 llvm/tools/llvm-exegesis/lib/ValidationEvent.cpp create mode 100644 llvm/tools/llvm-exegesis/lib/ValidationEvent.h diff --git a/llvm/include/llvm/Target/TargetPfmCounters.td b/llvm/include/llvm/Target/TargetPfmCounters.td index 8c4d5f50c63a2..cfe432a992b71 100644 --- a/llvm/include/llvm/Target/TargetPfmCounters.td +++ b/llvm/include/llvm/Target/TargetPfmCounters.td @@ -35,6 +35,7 @@ class ValidationEvent { int EventNumber = event_number; } +// TableGen names for events defined in 
`llvm::exegesis::ValidationEvent`. def InstructionRetired : ValidationEvent<0>; def L1DCacheLoadMiss : ValidationEvent<1>; def L1DCacheStoreMiss : ValidationEvent<2>; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp index 189add6464173..f84ebd2a4e68e 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp @@ -9,6 +9,7 @@ #include "BenchmarkResult.h" #include "BenchmarkRunner.h" #include "Error.h" +#include "ValidationEvent.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringRef.h" @@ -198,7 +199,7 @@ struct CustomMappingTraits> { static void inputOne(IO &Io, StringRef KeyStr, std::map &VI) { Expected Key = - exegesis::stringToValidationEvent(KeyStr); + exegesis::getValidationEventByName(KeyStr); if (!Key) { Io.setError("Key is not a valid validation event"); return; @@ -208,7 +209,7 @@ struct CustomMappingTraits> { static void output(IO &Io, std::map &VI) { for (auto &IndividualVI : VI) { - Io.mapRequired(exegesis::validationEventToString(IndividualVI.first), + Io.mapRequired(exegesis::getValidationEventName(IndividualVI.first), IndividualVI.second); } } @@ -441,49 +442,5 @@ bool operator==(const BenchmarkMeasure &A, const BenchmarkMeasure &B) { std::tie(B.Key, B.PerInstructionValue, B.PerSnippetValue); } -const char *validationEventToString(ValidationEvent VE) { - switch (VE) { - case exegesis::ValidationEvent::InstructionRetired: - return "instructions-retired"; - case exegesis::ValidationEvent::L1DCacheLoadMiss: - return "l1d-cache-load-misses"; - case exegesis::ValidationEvent::L1DCacheStoreMiss: - return "l1d-cache-store-misses"; - case exegesis::ValidationEvent::L1ICacheLoadMiss: - return "l1i-cache-load-misses"; - case exegesis::ValidationEvent::DataTLBLoadMiss: - return "data-tlb-load-misses"; - case exegesis::ValidationEvent::DataTLBStoreMiss: - return "data-tlb-store-misses"; - case 
exegesis::ValidationEvent::InstructionTLBLoadMiss: - return "instruction-tlb-load-misses"; - case exegesis::ValidationEvent::BranchPredictionMiss: - return "branch-prediction-misses"; - } - llvm_unreachable("Unhandled exegesis::ValidationEvent enum"); -} - -Expected stringToValidationEvent(StringRef Input) { - if (Input == "instructions-retired") - return exegesis::ValidationEvent::InstructionRetired; - else if (Input == "l1d-cache-load-misses") - return exegesis::ValidationEvent::L1DCacheLoadMiss; - else if (Input == "l1d-cache-store-misses") - return exegesis::ValidationEvent::L1DCacheStoreMiss; - else if (Input == "l1i-cache-load-misses") - return exegesis::ValidationEvent::L1ICacheLoadMiss; - else if (Input == "data-tlb-load-misses") - return exegesis::ValidationEvent::DataTLBLoadMiss; - else if (Input == "data-tlb-store-misses") - return exegesis::ValidationEvent::DataTLBStoreMiss; - else if (Input == "instruction-tlb-load-misses") - return exegesis::ValidationEvent::InstructionTLBLoadMiss; - else if (Input == "branch-prediction-misses") - return exegesis::ValidationEvent::BranchPredictionMiss; - else - return make_error("Invalid validation event string", - errc::invalid_argument); -} - } // namespace exegesis } // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h index 60115c51bba32..0aecaaeea4b2e 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -17,6 +17,7 @@ #include "LlvmState.h" #include "RegisterValue.h" +#include "ValidationEvent.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" @@ -32,20 +33,6 @@ class Error; namespace exegesis { -enum ValidationEvent { - InstructionRetired, - L1DCacheLoadMiss, - L1DCacheStoreMiss, - L1ICacheLoadMiss, - DataTLBLoadMiss, - DataTLBStoreMiss, - InstructionTLBLoadMiss, - BranchPredictionMiss -}; - -const char 
*validationEventToString(exegesis::ValidationEvent VE); -Expected stringToValidationEvent(StringRef Input); - enum class BenchmarkPhaseSelectorE { PrepareSnippet, PrepareAndAssembleSnippet, diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt index 6ae441d31f07f..414b49e5e021c 100644 --- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt @@ -73,6 +73,7 @@ add_llvm_library(LLVMExegesis SubprocessMemory.cpp Target.cpp UopsBenchmarkRunner.cpp + ValidationEvent.cpp LINK_LIBS ${libs} diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp index a9917a29cce24..de61fff643294 100644 --- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp @@ -107,8 +107,8 @@ Expected> LatencyBenchmarkRunner::runMeasurements( } for (size_t I = 0; I < ValCounterValues.size(); ++I) { - LLVM_DEBUG(dbgs() << validationEventToString(ValidationCounters[I]) - << ": " << IterationValCounterValues[I] << "\n"); + LLVM_DEBUG(dbgs() << getValidationEventName(ValidationCounters[I]) << ": " + << IterationValCounterValues[I] << "\n"); ValCounterValues[I] += IterationValCounterValues[I]; } } diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h index 3d6169c965021..7bbd946b03331 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ b/llvm/tools/llvm-exegesis/lib/Target.h @@ -22,6 +22,7 @@ #include "LlvmState.h" #include "PerfHelper.h" #include "SnippetGenerator.h" +#include "ValidationEvent.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/LegacyPassManager.h" diff --git a/llvm/tools/llvm-exegesis/lib/ValidationEvent.cpp b/llvm/tools/llvm-exegesis/lib/ValidationEvent.cpp new file mode 100644 index 0000000000000..c965b7ae55e10 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/ValidationEvent.cpp 
@@ -0,0 +1,56 @@ + +#include "ValidationEvent.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" + +namespace llvm { +namespace exegesis { + +namespace { + +struct ValidationEventInfo { + const char *const Name; + const char *const Description; +}; + +// Information about validation events, indexed by `ValidationEvent` enum +// value. +static constexpr ValidationEventInfo ValidationEventInfos[] = { + {"instructions-retired", "Count retired instructions"}, + {"l1d-cache-load-misses", "Count L1D load cache misses"}, + {"l1d-cache-store-misses", "Count L1D store cache misses"}, + {"l1i-cache-load-misses", "Count L1I load cache misses"}, + {"data-tlb-load-misses", "Count DTLB load misses"}, + {"data-tlb-store-misses", "Count DTLB store misses"}, + {"instruction-tlb-load-misses", "Count ITLB load misses"}, + {"branch-prediction-misses", "Branch prediction misses"}, +}; + +static_assert(sizeof(ValidationEventInfos) == + NumValidationEvents * sizeof(ValidationEventInfo), + "please update ValidationEventInfos"); + +} // namespace + +const char *getValidationEventName(ValidationEvent VE) { + return ValidationEventInfos[VE].Name; +} +const char *getValidationEventDescription(ValidationEvent VE) { + return ValidationEventInfos[VE].Description; +} + +Expected getValidationEventByName(StringRef Name) { + int VE = 0; + for (const ValidationEventInfo &Info : ValidationEventInfos) { + if (Name == Info.Name) + return static_cast(VE); + ++VE; + } + + return make_error("Invalid validation event string", + errc::invalid_argument); +} + +} // namespace exegesis +} // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/ValidationEvent.h b/llvm/tools/llvm-exegesis/lib/ValidationEvent.h new file mode 100644 index 0000000000000..8a9f3af57dca9 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/ValidationEvent.h @@ -0,0 +1,60 @@ +//===-- ValidationEvent.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM 
Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Definitions and utilities for Validation Events. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_VALIDATIONEVENT_H +#define LLVM_TOOLS_LLVM_EXEGESIS_VALIDATIONEVENT_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +namespace llvm { + +namespace exegesis { + +// The main list of supported validation events. The mapping between validation +// events and pfm counters is defined in TableDef files for each target. +enum ValidationEvent { + InstructionRetired, + L1DCacheLoadMiss, + L1DCacheStoreMiss, + L1ICacheLoadMiss, + DataTLBLoadMiss, + DataTLBStoreMiss, + InstructionTLBLoadMiss, + BranchPredictionMiss, + // Number of events. + NumValidationEvents, +}; + +// Returns the name/description of the given event. +const char *getValidationEventName(ValidationEvent VE); +const char *getValidationEventDescription(ValidationEvent VE); + +// Returns the ValidationEvent with the given name. +Expected getValidationEventByName(StringRef Name); + +// Command-line options for validation events. 
+struct ValidationEventOptions { + template void apply(Opt &O) const { + for (int I = 0; I < NumValidationEvents; ++I) { + const auto VE = static_cast(I); + O.getParser().addLiteralOption(getValidationEventName(VE), VE, + getValidationEventDescription(VE)); + } + } +}; + +} // namespace exegesis +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index ac279029e6b00..66387bdec5a5a 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -25,6 +25,7 @@ #include "lib/SnippetRepetitor.h" #include "lib/Target.h" #include "lib/TargetSelect.h" +#include "lib/ValidationEvent.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCInstBuilder.h" @@ -278,24 +279,7 @@ static cl::list ValidationCounters( cl::desc( "The name of a validation counter to run concurrently with the main " "counter to validate benchmarking assumptions"), - cl::CommaSeparated, cl::cat(BenchmarkOptions), - cl::values( - clEnumValN(ValidationEvent::InstructionRetired, "instructions-retired", - "Count retired instructions"), - clEnumValN(ValidationEvent::L1DCacheLoadMiss, "l1d-cache-load-misses", - "Count L1D load cache misses"), - clEnumValN(ValidationEvent::L1DCacheStoreMiss, "l1d-cache-store-misses", - "Count L1D store cache misses"), - clEnumValN(ValidationEvent::L1ICacheLoadMiss, "l1i-cache-load-misses", - "Count L1I load cache misses"), - clEnumValN(ValidationEvent::DataTLBLoadMiss, "data-tlb-load-misses", - "Count DTLB load misses"), - clEnumValN(ValidationEvent::DataTLBStoreMiss, "data-tlb-store-misses", - "Count DTLB store misses"), - clEnumValN(ValidationEvent::InstructionTLBLoadMiss, - "instruction-tlb-load-misses", "Count ITLB load misses"), - clEnumValN(ValidationEvent::BranchPredictionMiss, - "branch-prediction-misses", "Branch prediction misses"))); + cl::CommaSeparated, cl::cat(BenchmarkOptions), ValidationEventOptions()); static 
ExitOnError ExitOnErr("llvm-exegesis error: "); From 50373506d570f3db1e1af7c13d46409736452f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Wed, 21 Feb 2024 09:52:19 +0100 Subject: [PATCH 064/351] [clang] Preserve found-decl when constructing VarTemplateIds (#82265) --- clang/include/clang/Sema/Sema.h | 2 +- clang/lib/Sema/SemaTemplate.cpp | 18 ++++++++---------- clang/test/AST/ast-dump-using.cpp | 7 +++++++ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 89215bf3d1c69..23e1a623a20d1 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -8538,7 +8538,7 @@ class Sema final { /// if the arguments are dependent. ExprResult CheckVarTemplateId(const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, - VarTemplateDecl *Template, + VarTemplateDecl *Template, NamedDecl *FoundD, SourceLocation TemplateLoc, const TemplateArgumentListInfo *TemplateArgs); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 1a975a8d0a0df..7d3d665194add 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4958,11 +4958,10 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc, return Decl; } -ExprResult -Sema::CheckVarTemplateId(const CXXScopeSpec &SS, - const DeclarationNameInfo &NameInfo, - VarTemplateDecl *Template, SourceLocation TemplateLoc, - const TemplateArgumentListInfo *TemplateArgs) { +ExprResult Sema::CheckVarTemplateId( + const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, + VarTemplateDecl *Template, NamedDecl *FoundD, SourceLocation TemplateLoc, + const TemplateArgumentListInfo *TemplateArgs) { DeclResult Decl = CheckVarTemplateId(Template, TemplateLoc, NameInfo.getLoc(), *TemplateArgs); @@ -4978,8 +4977,7 @@ Sema::CheckVarTemplateId(const CXXScopeSpec &SS, NameInfo.getLoc()); // Build an ordinary singleton decl ref. 
- return BuildDeclarationNameExpr(SS, NameInfo, Var, - /*FoundD=*/nullptr, TemplateArgs); + return BuildDeclarationNameExpr(SS, NameInfo, Var, FoundD, TemplateArgs); } void Sema::diagnoseMissingTemplateArguments(TemplateName Name, @@ -5066,9 +5064,9 @@ ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS, bool KnownDependent = false; // In C++1y, check variable template ids. if (R.getAsSingle()) { - ExprResult Res = CheckVarTemplateId(SS, R.getLookupNameInfo(), - R.getAsSingle(), - TemplateKWLoc, TemplateArgs); + ExprResult Res = CheckVarTemplateId( + SS, R.getLookupNameInfo(), R.getAsSingle(), + R.getRepresentativeDecl(), TemplateKWLoc, TemplateArgs); if (Res.isInvalid() || Res.isUsable()) return Res; // Result is dependent. Carry on to build an UnresolvedLookupEpxr. diff --git a/clang/test/AST/ast-dump-using.cpp b/clang/test/AST/ast-dump-using.cpp index 5a4e910ffb865..8e5c60d3aabf4 100644 --- a/clang/test/AST/ast-dump-using.cpp +++ b/clang/test/AST/ast-dump-using.cpp @@ -2,6 +2,7 @@ namespace a { struct S; +template T x = {}; } namespace b { using a::S; @@ -21,4 +22,10 @@ typedef S e; // check the same UsingType is reused. 
// CHECK-NEXT: `-UsingType [[TYPE_ADDR]] 'a::S' sugar // CHECK-NEXT: |-UsingShadow [[SHADOW_ADDR]] 'S' // CHECK-NEXT: `-RecordType {{.*}} 'a::S' +using a::x; + +void foo() { + x = 3; + // CHECK: DeclRefExpr {{.*}} 'x' {{.*}} (UsingShadow {{.*}} 'x') +} } From f8c1af1d096b97a42e4ab178c93accfc4e5fa288 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 21 Feb 2024 09:00:43 +0000 Subject: [PATCH 065/351] [gn build] Port 8b84de26dfc1 --- llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn index 7e58064d8b99a..57d1003161678 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/BUILD.gn @@ -45,6 +45,7 @@ static_library("lib") { "SubprocessMemory.cpp", "Target.cpp", "UopsBenchmarkRunner.cpp", + "ValidationEvent.cpp", ] deps += targets_with_exegesis From 4725993f1a812c86b9ad79d229a015d0216ff550 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Wed, 21 Feb 2024 10:10:25 +0100 Subject: [PATCH 066/351] [clang][dataflow] Correctly handle `InitListExpr` of union type. 
(#82348) --- .../FlowSensitive/DataflowEnvironment.h | 9 ++++--- .../FlowSensitive/DataflowEnvironment.cpp | 18 ++++++++++--- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 25 +++++++++++-------- .../Analysis/FlowSensitive/TestingSupport.h | 19 ++++++++++++++ .../Analysis/FlowSensitive/TransferTest.cpp | 14 +++++++++-- 5 files changed, 65 insertions(+), 20 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index 0aecc749bf415..b3dc940705f87 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -753,9 +753,12 @@ RecordStorageLocation *getImplicitObjectLocation(const CXXMemberCallExpr &MCE, RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, const Environment &Env); -/// Returns the fields of `RD` that are initialized by an `InitListExpr`, in the -/// order in which they appear in `InitListExpr::inits()`. -std::vector getFieldsForInitListExpr(const RecordDecl *RD); +/// Returns the fields of a `RecordDecl` that are initialized by an +/// `InitListExpr`, in the order in which they appear in +/// `InitListExpr::inits()`. +/// `Init->getType()` must be a record type. +std::vector +getFieldsForInitListExpr(const InitListExpr *InitList); /// Associates a new `RecordValue` with `Loc` and returns the new value. 
RecordValue &refreshRecordValue(RecordStorageLocation &Loc, Environment &Env); diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index d487944ce9211..0cfc26ea952cd 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -361,8 +361,8 @@ getFieldsGlobalsAndFuncs(const Stmt &S, FieldSet &Fields, if (const auto *FD = dyn_cast(VD)) Fields.insert(FD); } else if (auto *InitList = dyn_cast(&S)) { - if (RecordDecl *RD = InitList->getType()->getAsRecordDecl()) - for (const auto *FD : getFieldsForInitListExpr(RD)) + if (InitList->getType()->isRecordType()) + for (const auto *FD : getFieldsForInitListExpr(InitList)) Fields.insert(FD); } } @@ -1104,12 +1104,22 @@ RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, return Env.get(*Base); } -std::vector getFieldsForInitListExpr(const RecordDecl *RD) { +std::vector +getFieldsForInitListExpr(const InitListExpr *InitList) { + const RecordDecl *RD = InitList->getType()->getAsRecordDecl(); + assert(RD != nullptr); + + std::vector Fields; + + if (InitList->getType()->isUnionType()) { + Fields.push_back(InitList->getInitializedFieldInUnion()); + return Fields; + } + // Unnamed bitfields are only used for padding and do not appear in // `InitListExpr`'s inits. However, those fields do appear in `RecordDecl`'s // field list, and we thus need to remove them before mapping inits to // fields to avoid mapping inits to the wrongs fields. 
- std::vector Fields; llvm::copy_if( RD->fields(), std::back_inserter(Fields), [](const FieldDecl *Field) { return !Field->isUnnamedBitfield(); }); diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index fe13e919bddcd..cd1f04e53cff6 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -663,14 +663,7 @@ class TransferVisitor : public ConstStmtVisitor { void VisitInitListExpr(const InitListExpr *S) { QualType Type = S->getType(); - if (Type->isUnionType()) { - // FIXME: Initialize unions properly. - if (auto *Val = Env.createValue(Type)) - Env.setValue(*S, *Val); - return; - } - - if (!Type->isStructureOrClassType()) { + if (!Type->isRecordType()) { // Until array initialization is implemented, we don't need to care about // cases where `getNumInits() > 1`. if (S->getNumInits() == 1) @@ -688,10 +681,9 @@ class TransferVisitor : public ConstStmtVisitor { llvm::DenseMap FieldLocs; // This only contains the direct fields for the given type. - std::vector FieldsForInit = - getFieldsForInitListExpr(Type->getAsRecordDecl()); + std::vector FieldsForInit = getFieldsForInitListExpr(S); - // `S->inits()` contains all the initializer epressions, including the + // `S->inits()` contains all the initializer expressions, including the // ones for direct base classes. auto Inits = S->inits(); size_t InitIdx = 0; @@ -731,6 +723,17 @@ class TransferVisitor : public ConstStmtVisitor { FieldLocs.insert({Field, &Loc}); } + // In the case of a union, we don't in general have initializers for all + // of the fields. Create storage locations for the remaining fields (but + // don't associate them with values). 
+ if (Type->isUnionType()) { + for (const FieldDecl *Field : + Env.getDataflowAnalysisContext().getModeledFields(Type)) { + if (auto [it, inserted] = FieldLocs.insert({Field, nullptr}); inserted) + it->second = &Env.createStorageLocation(Field->getType()); + } + } + // Check that we satisfy the invariant that a `RecordStorageLoation` // contains exactly the set of modeled fields for that type. // `ModeledFields` includes fields from all the bases, but only the diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h index 0d36d2802897f..b7cf6cc966edb 100644 --- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h +++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h @@ -432,6 +432,8 @@ llvm::Error checkDataflowWithNoopAnalysis( {}); /// Returns the `ValueDecl` for the given identifier. +/// The returned pointer is guaranteed to be non-null; the function asserts if +/// no `ValueDecl` with the given name is found. /// /// Requirements: /// @@ -475,6 +477,15 @@ ValueT &getValueForDecl(ASTContext &ASTCtx, const Environment &Env, return *cast(Env.getValue(*VD)); } +/// Returns the storage location for the field called `Name` of `Loc`. +/// Optionally casts the field storage location to `T`. +template +std::enable_if_t, T &> +getFieldLoc(const RecordStorageLocation &Loc, llvm::StringRef Name, + ASTContext &ASTCtx) { + return *cast(Loc.getChild(*findValueDecl(ASTCtx, Name))); +} + /// Returns the value of a `Field` on the record referenced by `Loc.` /// Returns null if `Loc` is null. inline Value *getFieldValue(const RecordStorageLocation *Loc, @@ -487,6 +498,14 @@ inline Value *getFieldValue(const RecordStorageLocation *Loc, return Env.getValue(*FieldLoc); } +/// Returns the value of a `Field` on the record referenced by `Loc.` +/// Returns null if `Loc` is null. 
+inline Value *getFieldValue(const RecordStorageLocation *Loc, + llvm::StringRef Name, ASTContext &ASTCtx, + const Environment &Env) { + return getFieldValue(Loc, *findValueDecl(ASTCtx, Name), Env); +} + /// Creates and owns constraints which are boolean values. class ConstraintContext { unsigned NextAtom = 0; diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index a65b0446ac781..e7d74581865a3 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -2377,14 +2377,24 @@ TEST(TransferTest, InitListExprAsUnion) { } F; public: - constexpr target() : F{nullptr} {} + constexpr target() : F{nullptr} { + int *null = nullptr; + F.b; // Make sure we reference 'b' so it is modeled. + // [[p]] + } }; )cc"; runDataflow( Code, [](const llvm::StringMap> &Results, ASTContext &ASTCtx) { - // Just verify that it doesn't crash. + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + auto &FLoc = getFieldLoc( + *Env.getThisPointeeStorageLocation(), "F", ASTCtx); + auto *AVal = cast(getFieldValue(&FLoc, "a", ASTCtx, Env)); + ASSERT_EQ(AVal, &getValueForDecl(ASTCtx, Env, "null")); + ASSERT_EQ(getFieldValue(&FLoc, "b", ASTCtx, Env), nullptr); }); } From 0c13a896dfc930a09e082ad83070e223cfd9a4f9 Mon Sep 17 00:00:00 2001 From: Kohei Yamaguchi Date: Wed, 21 Feb 2024 18:11:22 +0900 Subject: [PATCH 067/351] [mlir][docs] Fix broken docs (#82308) - Fixed OpenACC's spec link format - Add missed `OpenACCPasses.md` into Passes.md - Add missed `MyExtensionCh4.md` into Ch4.md of tutorial of transform --- mlir/docs/Dialects/OpenACCDialect.md | 5 +++-- mlir/docs/Passes.md | 4 ++++ mlir/docs/Tutorials/transform/Ch4.md | 4 ++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/mlir/docs/Dialects/OpenACCDialect.md b/mlir/docs/Dialects/OpenACCDialect.md index ce0f1c3bbbba8..2f1bb194a167d 100755 --- 
a/mlir/docs/Dialects/OpenACCDialect.md +++ b/mlir/docs/Dialects/OpenACCDialect.md @@ -9,8 +9,8 @@ by giving the compiler the freedom of how to parallelize for specific architectures. OpenACC also provides the ability to optimize the parallelism through increasingly more prescriptive clauses. -This dialect models the constructs from the [OpenACC 3.3 specification] -(https://www.openacc.org/sites/default/files/inline-images/Specification/OpenACC-3.3-final.pdf) +This dialect models the constructs from the +[OpenACC 3.3 specification](https://www.openacc.org/sites/default/files/inline-images/Specification/OpenACC-3.3-final.pdf) This document describes the design of the OpenACC dialect in MLIR. It lists and explains design goals and design choices along with their @@ -451,3 +451,4 @@ dominates another. ## Operations TOC [include "Dialects/OpenACCDialectOps.md"] + diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md index ee7d47cc02272..84e6664436d7b 100644 --- a/mlir/docs/Passes.md +++ b/mlir/docs/Passes.md @@ -16,6 +16,10 @@ This document describes the available MLIR passes and their contracts. [include "ConversionPasses.md"] +## 'acc' Dialect Passes + +[include "OpenACCPasses.md"] + ## 'affine' Dialect Passes [include "AffinePasses.md"] diff --git a/mlir/docs/Tutorials/transform/Ch4.md b/mlir/docs/Tutorials/transform/Ch4.md index ad5221c6f6cca..c3159eb991d1e 100644 --- a/mlir/docs/Tutorials/transform/Ch4.md +++ b/mlir/docs/Tutorials/transform/Ch4.md @@ -579,3 +579,7 @@ accessed dimensions as other parameters that can be compared with each other to ensure the subscripts are `m,k` for LHS, `k,n` for RHS and `m,n` for the init/result given the `m,n,k` notation for loops. 
+## Appendix: Autogenerated Documentation + +[include "Tutorials/transform/MyExtensionCh4.md"] + From 07292b7203e31fb90d9180bfccde0d4e84be2245 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 21 Feb 2024 10:13:41 +0100 Subject: [PATCH 068/351] [LIR][SCEVExpander] Restore original flags when aborting transform (#82362) SCEVExpanderCleaner will currently remove instructions created by SCEVExpander, but not restore poison generating flags that it may have dropped. As such, running LIR can currently spuriously drop flags without performing any transforms. Fix this by keeping track of original instruction flags in SCEVExpander. Fixes https://github.com/llvm/llvm-project/issues/82337. --- .../Utils/ScalarEvolutionExpander.h | 20 +++++++++ .../Utils/ScalarEvolutionExpander.cpp | 42 +++++++++++++++++++ llvm/test/Transforms/LoopIdiom/pr82337.ll | 6 +-- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index fa10443f14bb7..9de0996fb1e30 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -41,6 +41,17 @@ struct SCEVOperand { const SCEV* S; }; +struct PoisonFlags { + unsigned NUW : 1; + unsigned NSW : 1; + unsigned Exact : 1; + unsigned Disjoint : 1; + unsigned NNeg : 1; + + PoisonFlags(const Instruction *I); + void apply(Instruction *I); +}; + /// This class uses information about analyze scalars to rewrite expressions /// in canonical form. /// @@ -48,6 +59,8 @@ struct SCEVOperand { /// and destroy it when finished to allow the release of the associated /// memory. class SCEVExpander : public SCEVVisitor { + friend class SCEVExpanderCleaner; + ScalarEvolution &SE; const DataLayout &DL; @@ -70,6 +83,10 @@ class SCEVExpander : public SCEVVisitor { /// InsertedValues/InsertedPostIncValues. 
SmallPtrSet ReusedValues; + /// Original flags of instructions for which they were modified. Used + /// by SCEVExpanderCleaner to undo changes. + DenseMap, PoisonFlags> OrigFlags; + // The induction variables generated. SmallVector InsertedIVs; @@ -188,6 +205,7 @@ class SCEVExpander : public SCEVVisitor { InsertedValues.clear(); InsertedPostIncValues.clear(); ReusedValues.clear(); + OrigFlags.clear(); ChainedPhis.clear(); InsertedIVs.clear(); } @@ -491,6 +509,8 @@ class SCEVExpander : public SCEVVisitor { void rememberInstruction(Value *I); + void rememberFlags(Instruction *I); + bool isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L); bool isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index fbe1dba5b8d4e..0f67cc3ff4fac 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -43,6 +43,37 @@ cl::opt llvm::SCEVCheapExpansionBudget( using namespace PatternMatch; +PoisonFlags::PoisonFlags(const Instruction *I) { + NUW = false; + NSW = false; + Exact = false; + Disjoint = false; + NNeg = false; + if (auto *OBO = dyn_cast(I)) { + NUW = OBO->hasNoUnsignedWrap(); + NSW = OBO->hasNoSignedWrap(); + } + if (auto *PEO = dyn_cast(I)) + Exact = PEO->isExact(); + if (auto *PDI = dyn_cast(I)) + Disjoint = PDI->isDisjoint(); + if (auto *PNI = dyn_cast(I)) + NNeg = PNI->hasNonNeg(); +} + +void PoisonFlags::apply(Instruction *I) { + if (isa(I)) { + I->setHasNoUnsignedWrap(NUW); + I->setHasNoSignedWrap(NSW); + } + if (isa(I)) + I->setIsExact(Exact); + if (auto *PDI = dyn_cast(I)) + PDI->setIsDisjoint(Disjoint); + if (auto *PNI = dyn_cast(I)) + PNI->setNonNeg(NNeg); +} + /// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP, /// reusing an existing cast if a suitable one (= dominating IP) exists, or /// creating a new one. 
@@ -724,6 +755,7 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos, auto FixupPoisonFlags = [this](Instruction *I) { // Drop flags that are potentially inferred from old context and infer flags // in new context. + rememberFlags(I); I->dropPoisonGeneratingFlags(); if (auto *OBO = dyn_cast(I)) if (auto Flags = SE.getStrengthenedNoWrapFlagsFromBinOp(OBO)) { @@ -1481,6 +1513,7 @@ Value *SCEVExpander::expand(const SCEV *S) { V = fixupLCSSAFormFor(V); } else { for (Instruction *I : DropPoisonGeneratingInsts) { + rememberFlags(I); I->dropPoisonGeneratingFlagsAndMetadata(); // See if we can re-infer from first principles any of the flags we just // dropped. @@ -1521,6 +1554,11 @@ void SCEVExpander::rememberInstruction(Value *I) { DoInsert(I); } +void SCEVExpander::rememberFlags(Instruction *I) { + // If we already have flags for the instruction, keep the existing ones. + OrigFlags.try_emplace(I, PoisonFlags(I)); +} + void SCEVExpander::replaceCongruentIVInc( PHINode *&Phi, PHINode *&OrigPhi, Loop *L, const DominatorTree *DT, SmallVectorImpl &DeadInsts) { @@ -2318,6 +2356,10 @@ void SCEVExpanderCleaner::cleanup() { if (ResultUsed) return; + // Restore original poison flags. + for (auto [I, Flags] : Expander.OrigFlags) + Flags.apply(I); + auto InsertedInstructions = Expander.getAllInsertedInstructions(); #ifndef NDEBUG SmallPtrSet InsertedSet(InsertedInstructions.begin(), diff --git a/llvm/test/Transforms/LoopIdiom/pr82337.ll b/llvm/test/Transforms/LoopIdiom/pr82337.ll index e8a6e1704f7c1..da9eb14af3f0a 100644 --- a/llvm/test/Transforms/LoopIdiom/pr82337.ll +++ b/llvm/test/Transforms/LoopIdiom/pr82337.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt -S -passes=loop-idiom < %s | FileCheck %s -; FIXME: The poison flags should be preserved, as no transform takes place. +; The poison flags should be preserved, as no transform takes place. 
define void @test(ptr %p.end, ptr %p.start) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr [[P_END:%.*]], ptr [[P_START:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P_END_INT:%.*]] = ptrtoint ptr [[P_END]] to i64 ; CHECK-NEXT: [[P_START_INT:%.*]] = ptrtoint ptr [[P_START]] to i64 -; CHECK-NEXT: [[DIST:%.*]] = sub i64 [[P_END_INT]], [[P_START_INT]] -; CHECK-NEXT: [[LEN:%.*]] = lshr i64 [[DIST]], 5 +; CHECK-NEXT: [[DIST:%.*]] = sub nuw i64 [[P_END_INT]], [[P_START_INT]] +; CHECK-NEXT: [[LEN:%.*]] = lshr exact i64 [[DIST]], 5 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P_END]], [[P_START]] ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[PREHEADER:%.*]] ; CHECK: preheader: From 1ff1e823836e6ed741c69681a2af9f1c3871e8c2 Mon Sep 17 00:00:00 2001 From: Tuan Chuong Goh Date: Tue, 20 Feb 2024 14:20:21 +0000 Subject: [PATCH 069/351] [AArch64][GlobalISel] Pre-Commit Tests for Refactor BITCAST --- llvm/test/CodeGen/AArch64/bitcast.ll | 508 ++++++++++++++++++++++++++- 1 file changed, 499 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index d60bd4ab3fc5f..bac9b48a4087b 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -1,12 +1,39 @@ -; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; PR23065: SCALAR_TO_VECTOR implies the top elements 1 to N-1 of the N-element vector are undefined. 
-define <4 x i16> @foo1(<2 x i32> %a) { -; CHECK-LABEL: foo1: -; CHECK: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret +; CHECK-GI: warning: Instruction selection used fallback path for bitcast_v4i8_i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_i32_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v2i16_i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_i32_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v2i16_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v4i8_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v4i64_v8i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v4i64_v16i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v8i32_v4i64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v8i32_v16i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v8i64_v16i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v16i16_v4i64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v16i16_v8i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v16i32_v8i64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v3i32_v6i16 +define <4 x i16> @foo1(<2 x i32> %a) { +; CHECK-SD-LABEL: foo1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: foo1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #58712 // =0xe558 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s +; CHECK-GI-NEXT: rev32 v0.4h, v0.4h +; CHECK-GI-NEXT: ret %1 = shufflevector <2 x i32> , <2 x i32> %a, <2 x i32> ; Can't optimize the following 
bitcast to scalar_to_vector. %2 = bitcast <2 x i32> %1 to <4 x i16> @@ -15,13 +42,476 @@ define <4 x i16> @foo1(<2 x i32> %a) { } define <4 x i16> @foo2(<2 x i32> %a) { -; CHECK-LABEL: foo2: -; CHECK: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ret - +; CHECK-SD-LABEL: foo2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: foo2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #712 // =0x2c8 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s +; CHECK-GI-NEXT: rev32 v0.4h, v0.4h +; CHECK-GI-NEXT: ret %1 = shufflevector <2 x i32> , <2 x i32> %a, <2 x i32> ; Can't optimize the following bitcast to scalar_to_vector. %2 = bitcast <2 x i32> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> ret <4 x i16> %3 } + +; ===== To and From Scalar Types ===== + +define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){ +; CHECK-LABEL: bitcast_v4i8_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %c = add <4 x i8> %a, %b + %d = bitcast <4 x i8> %c to i32 + ret i32 %d +} + +define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){ +; CHECK-LABEL: bitcast_i32_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: ret + %c = add i32 %a, %b + %d = bitcast i32 %c to <4 x i8> + ret <4 x i8> %d +} + +define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ +; CHECK-LABEL: bitcast_v2i16_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strh w9, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #14] +; CHECK-NEXT: ldr w0, [sp, #12] +; 
CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %c = add <2 x i16> %a, %b + %d = bitcast <2 x i16> %c to i32 + ret i32 %d +} + +define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){ +; CHECK-LABEL: bitcast_i32_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = add i32 %a, %b + %d = bitcast i32 %c to <2 x i16> + ret <2 x i16> %d +} + +define i64 @bitcast_v8i8_i64(<8 x i8> %a, <8 x i8> %b){ +; CHECK-LABEL: bitcast_v8i8_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %c = add <8 x i8> %a, %b + %d = bitcast <8 x i8> %c to i64 + ret i64 %d +} + +define <8 x i8> @bitcast_i64_v8i8(i64 %a, i64 %b){ +; CHECK-LABEL: bitcast_i64_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %c = add i64 %a, %b + %d = bitcast i64 %c to <8 x i8> + ret <8 x i8> %d +} + +define i64 @bitcast_v4i16_i64(<4 x i16> %a, <4 x i16> %b){ +; CHECK-LABEL: bitcast_v4i16_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %c = add <4 x i16> %a, %b + %d = bitcast <4 x i16> %c to i64 + ret i64 %d +} + +define <4 x i16> @bitcast_i64_v4i16(i64 %a, i64 %b){ +; CHECK-LABEL: bitcast_i64_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %c = add i64 %a, %b + %d = bitcast i64 %c to <4 x i16> + ret <4 x i16> %d +} + +define i64 @bitcast_v2i32_i64(<2 x i32> %a, <2 x i32> %b){ +; CHECK-LABEL: bitcast_v2i32_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %c = add <2 x i32> %a, %b + %d = bitcast <2 x i32> %c to i64 + ret i64 %d +} + +define <2 x i32> @bitcast_i64_v2i32(i64 %a, i64 %b){ +; CHECK-LABEL: bitcast_i64_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1 +; 
CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %c = add i64 %a, %b + %d = bitcast i64 %c to <2 x i32> + ret <2 x i32> %d +} + +; ===== Legal Vector Types ===== + +define <4 x i16> @bitcast_v2i32_v4i16(<2 x i32> %a, <2 x i32> %b){ +; CHECK-LABEL: bitcast_v2i32_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %c = add <2 x i32> %a, %b + %d = bitcast <2 x i32> %c to <4 x i16> + ret <4 x i16> %d +} + +define <4 x i32> @bitcast_v2i64_v4i32(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: bitcast_v2i64_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %c = add <2 x i64> %a, %b + %d = bitcast <2 x i64> %c to <4 x i32> + ret <4 x i32> %d +} + +define <8 x i8> @bitcast_v2i32_v8i8(<2 x i32> %a, <2 x i32> %b){ +; CHECK-LABEL: bitcast_v2i32_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %c = add <2 x i32> %a, %b + %d = bitcast <2 x i32> %c to <8 x i8> + ret <8 x i8> %d +} + +define <8 x i16> @bitcast_v2i64_v8i16(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: bitcast_v2i64_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %c = add <2 x i64> %a, %b + %d = bitcast <2 x i64> %c to <8 x i16> + ret <8 x i16> %d +} + +define <16 x i8> @bitcast_v2i64_v16i8(<2 x i64> %a, <2 x i64> %b){ +; CHECK-LABEL: bitcast_v2i64_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %c = add <2 x i64> %a, %b + %d = bitcast <2 x i64> %c to <16 x i8> + ret <16 x i8> %d +} + +define <2 x i32> @bitcast_v4i16_v2i32(<4 x i16> %a, <4 x i16> %b){ +; CHECK-LABEL: bitcast_v4i16_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %c = add <4 x i16> %a, %b + %d = bitcast <4 x i16> %c to <2 x i32> + ret <2 x i32> %d +} + +define <2 x i64> @bitcast_v4i32_v2i64(<4 x i32> %a, <4 x i32> %b){ +; CHECK-LABEL: bitcast_v4i32_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %c 
= add <4 x i32> %a, %b + %d = bitcast <4 x i32> %c to <2 x i64> + ret <2 x i64> %d +} + +define <8 x i8> @bitcast_v4i16_v8i8(<4 x i16> %a, <4 x i16> %b){ +; CHECK-LABEL: bitcast_v4i16_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %c = add <4 x i16> %a, %b + %d = bitcast <4 x i16> %c to <8 x i8> + ret <8 x i8> %d +} + +define <8 x i16> @bitcast_v4i32_v8i16(<4 x i32> %a, <4 x i32> %b){ +; CHECK-LABEL: bitcast_v4i32_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %c = add <4 x i32> %a, %b + %d = bitcast <4 x i32> %c to <8 x i16> + ret <8 x i16> %d +} + +define <16 x i8> @bitcast_v4i32_v16i8(<4 x i32> %a, <4 x i32> %b){ +; CHECK-LABEL: bitcast_v4i32_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %c = add <4 x i32> %a, %b + %d = bitcast <4 x i32> %c to <16 x i8> + ret <16 x i8> %d +} + +define <2 x i32> @bitcast_v8i8_v2i32(<8 x i8> %a, <8 x i8> %b){ +; CHECK-LABEL: bitcast_v8i8_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %c = add <8 x i8> %a, %b + %d = bitcast <8 x i8> %c to <2 x i32> + ret <2 x i32> %d +} + +define <2 x i64> @bitcast_v8i16_v2i64(<8 x i16> %a, <8 x i16> %b){ +; CHECK-LABEL: bitcast_v8i16_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %c = add <8 x i16> %a, %b + %d = bitcast <8 x i16> %c to <2 x i64> + ret <2 x i64> %d +} + +define <4 x i16> @bitcast_v8i8_v4i16(<8 x i8> %a, <8 x i8> %b){ +; CHECK-LABEL: bitcast_v8i8_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %c = add <8 x i8> %a, %b + %d = bitcast <8 x i8> %c to <4 x i16> + ret <4 x i16> %d +} + +define <4 x i32> @bitcast_v8i16_v4i32(<8 x i16> %a, <8 x i16> %b){ +; CHECK-LABEL: bitcast_v8i16_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %c = add <8 x i16> %a, %b + %d = bitcast <8 x i16> %c to <4 x i32> + ret <4 x i32> %d 
+} + +define <16 x i8> @bitcast_v8i16_v16i8(<8 x i16> %a, <8 x i16> %b){ +; CHECK-LABEL: bitcast_v8i16_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %c = add <8 x i16> %a, %b + %d = bitcast <8 x i16> %c to <16 x i8> + ret <16 x i8> %d +} + +define <2 x i64> @bitcast_v16i8_v2i64(<16 x i8> %a, <16 x i8> %b){ +; CHECK-LABEL: bitcast_v16i8_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %c = add <16 x i8> %a, %b + %d = bitcast <16 x i8> %c to <2 x i64> + ret <2 x i64> %d +} + +define <4 x i32> @bitcast_v16i8_v4i32(<16 x i8> %a, <16 x i8> %b){ +; CHECK-LABEL: bitcast_v16i8_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %c = add <16 x i8> %a, %b + %d = bitcast <16 x i8> %c to <4 x i32> + ret <4 x i32> %d +} + +define <8 x i16> @bitcast_v16i8_v8i16(<16 x i8> %a, <16 x i8> %b){ +; CHECK-LABEL: bitcast_v16i8_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %c = add <16 x i8> %a, %b + %d = bitcast <16 x i8> %c to <8 x i16> + ret <8 x i16> %d +} + +; ===== Smaller/Larger Width Vectors with Legal Element Sizes ===== + +define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ +; CHECK-LABEL: bitcast_v2i16_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strh w9, [sp, #12] +; CHECK-NEXT: strh w8, [sp, #14] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %c = add <2 x i16> %a, %b + %d = bitcast <2 x i16> %c to <4 x i8> + ret <4 x i8> %d +} + +define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ +; CHECK-LABEL: bitcast_v4i8_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 
16 +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: add x8, sp, #12 +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: str s0, [sp, #12] +; CHECK-NEXT: ld1 { v0.h }[0], [x8] +; CHECK-NEXT: orr x8, x8, #0x2 +; CHECK-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %c = add <4 x i8> %a, %b + %d = bitcast <4 x i8> %c to <2 x i16> + ret <2 x i16> %d +} + +define <8 x i32> @bitcast_v4i64_v8i32(<4 x i64> %a, <4 x i64> %b){ +; CHECK-LABEL: bitcast_v4i64_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ret + %c = add <4 x i64> %a, %b + %d = bitcast <4 x i64> %c to <8 x i32> + ret <8 x i32> %d +} + +define <16 x i16> @bitcast_v4i64_v16i16(<4 x i64> %a, <4 x i64> %b){ +; CHECK-LABEL: bitcast_v4i64_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ret + %c = add <4 x i64> %a, %b + %d = bitcast <4 x i64> %c to <16 x i16> + ret <16 x i16> %d +} + +define <4 x i64> @bitcast_v8i32_v4i64(<8 x i32> %a, <8 x i32> %b){ +; CHECK-LABEL: bitcast_v8i32_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret + %c = add <8 x i32> %a, %b + %d = bitcast <8 x i32> %c to <4 x i64> + ret <4 x i64> %d +} + +define <16 x i16> @bitcast_v8i32_v16i16(<8 x i32> %a, <8 x i32> %b){ +; CHECK-LABEL: bitcast_v8i32_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret + %c = add <8 x i32> %a, %b + %d = bitcast <8 x i32> %c to <16 x i16> + ret <16 x i16> %d +} + +define <16 x i32> @bitcast_v8i64_v16i32(<8 x i64> %a, <8 x i64> %b){ +; CHECK-LABEL: bitcast_v8i64_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v2.2d, v2.2d, v6.2d +; CHECK-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-NEXT: add v1.2d, v1.2d, v5.2d +; CHECK-NEXT: add v3.2d, v3.2d, v7.2d +; 
CHECK-NEXT: ret + %c = add <8 x i64> %a, %b + %d = bitcast <8 x i64> %c to <16 x i32> + ret <16 x i32> %d +} + +define <4 x i64> @bitcast_v16i16_v4i64(<16 x i16> %a, <16 x i16> %b){ +; CHECK-LABEL: bitcast_v16i16_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret + %c = add <16 x i16> %a, %b + %d = bitcast <16 x i16> %c to <4 x i64> + ret <4 x i64> %d +} + +define <8 x i32> @bitcast_v16i16_v8i32(<16 x i16> %a, <16 x i16> %b){ +; CHECK-LABEL: bitcast_v16i16_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret + %c = add <16 x i16> %a, %b + %d = bitcast <16 x i16> %c to <8 x i32> + ret <8 x i32> %d +} + +define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){ +; CHECK-LABEL: bitcast_v16i32_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-NEXT: add v3.4s, v3.4s, v7.4s +; CHECK-NEXT: ret + %c = add <16 x i32> %a, %b + %d = bitcast <16 x i32> %c to <8 x i64> + ret <8 x i64> %d +} + +; ===== Vectors with Non-Pow 2 Widths ===== + +define <6 x i16> @bitcast_v3i32_v6i16(<3 x i32> %a, <3 x i32> %b){ +; CHECK-LABEL: bitcast_v3i32_v6i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %c = add <3 x i32> %a, %b + %d = bitcast <3 x i32> %c to <6 x i16> + ret <6 x i16> %d +} From 7242896233635e553694507e6584decb43ee4a16 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 21 Feb 2024 09:31:29 +0000 Subject: [PATCH 070/351] [Flang] Attempt to fix Nan handling in Minloc/Maxloc intrinsic simplification (#82313) In certain case "extreme" values like Nan, Inf and 0xffffffff could lead to generating different code via the inline-generated intrinsics vs the versions in the runtimes (and other compilers like gfortran). 
There are some examples I was using for testing in https://godbolt.org/z/x4EfqEss5. This changes the generation for the intrinsics to be more like the runtimes, using a condition that is similar to: isFirst || (prev != prev && elem == elem) || elem < prev The middle part is only used for floating point operations, and checks if the values are Nan. This should then hopefully make the logic closer to - return the first element with the lowest value, with Nans ignored unless there are only Nans. The initial limit value for floats are also changed from the largest float to Inf, to make sure it is handled correctly. The integer reductions are also changed to use a similar scheme to make sure they work with masked values. This means that the preamble after the loop can be removed. --- .../Transforms/OptimizedBufferization.cpp | 25 +++- .../Transforms/SimplifyIntrinsics.cpp | 59 +++------ flang/test/HLFIR/maxloc-elemental.fir | 34 +++--- flang/test/HLFIR/minloc-elemental.fir | 70 +++++------ flang/test/Transforms/simplifyintrinsics.fir | 114 +++++++----------- 5 files changed, 136 insertions(+), 166 deletions(-) diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp index c2512c7df32f4..685c73d676257 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp @@ -852,9 +852,8 @@ class MinMaxlocElementalConversion : public mlir::OpRewritePattern { mlir::Type elementType) { if (auto ty = elementType.dyn_cast()) { const llvm::fltSemantics &sem = ty.getFloatSemantics(); - return builder.createRealConstant( - loc, elementType, - llvm::APFloat::getLargest(sem, /*Negative=*/isMax)); + llvm::APFloat limit = llvm::APFloat::getInf(sem, /*Negative=*/isMax); + return builder.createRealConstant(loc, elementType, limit); } unsigned bits = elementType.getIntOrFloatBitWidth(); int64_t limitInt = @@ -895,7 +894,7 @@ 
class MinMaxlocElementalConversion : public mlir::OpRewritePattern { // Set flag that mask was true at some point mlir::Value flagSet = builder.createIntegerConstant( loc, mlir::cast(flagRef.getType()).getEleTy(), 1); - builder.create(loc, flagSet, flagRef); + mlir::Value isFirst = builder.create(loc, flagRef); mlir::Value addr = hlfir::getElementAt(loc, builder, hlfir::Entity{array}, oneBasedIndices); mlir::Value elem = builder.create(loc, addr); @@ -903,11 +902,22 @@ class MinMaxlocElementalConversion : public mlir::OpRewritePattern { // Compare with the max reduction value mlir::Value cmp; if (elementType.isa()) { + // For FP reductions we want the first smallest value to be used, that + // is not NaN. A OGL/OLT condition will usually work for this unless all + // the values are Nan or Inf. This follows the same logic as + // NumericCompare for Minloc/Maxlox in extrema.cpp. cmp = builder.create( loc, isMax ? mlir::arith::CmpFPredicate::OGT : mlir::arith::CmpFPredicate::OLT, elem, reduction); + + mlir::Value cmpNan = builder.create( + loc, mlir::arith::CmpFPredicate::UNE, reduction, reduction); + mlir::Value cmpNan2 = builder.create( + loc, mlir::arith::CmpFPredicate::OEQ, elem, elem); + cmpNan = builder.create(loc, cmpNan, cmpNan2); + cmp = builder.create(loc, cmp, cmpNan); } else if (elementType.isa()) { cmp = builder.create( loc, @@ -918,11 +928,18 @@ class MinMaxlocElementalConversion : public mlir::OpRewritePattern { llvm_unreachable("unsupported type"); } + // The condition used for the loop is isFirst || . 
+ isFirst = builder.create(loc, cmp.getType(), isFirst); + isFirst = builder.create( + loc, isFirst, builder.createIntegerConstant(loc, cmp.getType(), 1)); + cmp = builder.create(loc, cmp, isFirst); + // Set the new coordinate to the result fir::IfOp ifOp = builder.create(loc, elementType, cmp, /*withElseRegion*/ true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + builder.create(loc, flagSet, flagRef); mlir::Type resultElemTy = hlfir::getFortranElementType(resultArr.getType()); mlir::Type returnRefTy = builder.getRefType(resultElemTy); diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index 86343e23c6e5d..f483651a68dc1 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -649,42 +649,6 @@ void fir::genMinMaxlocReductionLoop( reductionVal = ifOp.getResult(0); } } - - // Check for case where array was full of max values. - // flag will be 0 if mask was never true, 1 if mask was true as some point, - // this is needed to avoid catching cases where we didn't access any elements - // e.g. mask=.FALSE. 
- mlir::Value flagValue = - builder.create(loc, resultElemType, flagRef); - mlir::Value flagCmp = builder.create( - loc, mlir::arith::CmpIPredicate::eq, flagValue, flagSet); - fir::IfOp ifMaskTrueOp = - builder.create(loc, flagCmp, /*withElseRegion=*/false); - builder.setInsertionPointToStart(&ifMaskTrueOp.getThenRegion().front()); - - mlir::Value testInit = initVal(builder, loc, elementType); - fir::IfOp ifMinSetOp; - if (elementType.isa()) { - mlir::Value cmp = builder.create( - loc, mlir::arith::CmpFPredicate::OEQ, testInit, reductionVal); - ifMinSetOp = builder.create(loc, cmp, - /*withElseRegion*/ false); - } else { - mlir::Value cmp = builder.create( - loc, mlir::arith::CmpIPredicate::eq, testInit, reductionVal); - ifMinSetOp = builder.create(loc, cmp, - /*withElseRegion*/ false); - } - builder.setInsertionPointToStart(&ifMinSetOp.getThenRegion().front()); - - // Load output array with 1s instead of 0s - for (unsigned int i = 0; i < rank; ++i) { - mlir::Value index = builder.createIntegerConstant(loc, idxTy, i); - mlir::Value resultElemAddr = - getAddrFn(builder, loc, resultElemType, resultArr, index); - builder.create(loc, flagSet, resultElemAddr); - } - builder.setInsertionPointAfter(ifMaskTrueOp); } static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, @@ -697,8 +661,8 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, mlir::Type elementType) { if (auto ty = elementType.dyn_cast()) { const llvm::fltSemantics &sem = ty.getFloatSemantics(); - return builder.createRealConstant( - loc, elementType, llvm::APFloat::getLargest(sem, /*Negative=*/isMax)); + llvm::APFloat limit = llvm::APFloat::getInf(sem, /*Negative=*/isMax); + return builder.createRealConstant(loc, elementType, limit); } unsigned bits = elementType.getIntOrFloatBitWidth(); int64_t initValue = (isMax ? 
llvm::APInt::getSignedMinValue(bits) @@ -770,7 +734,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, // Set flag that mask was true at some point mlir::Value flagSet = builder.createIntegerConstant( loc, mlir::cast(flagRef.getType()).getEleTy(), 1); - builder.create(loc, flagSet, flagRef); + mlir::Value isFirst = builder.create(loc, flagRef); mlir::Type eleRefTy = builder.getRefType(elementType); mlir::Value addr = builder.create(loc, eleRefTy, array, indices); @@ -778,11 +742,22 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, mlir::Value cmp; if (elementType.isa()) { + // For FP reductions we want the first smallest value to be used, that + // is not NaN. A OGL/OLT condition will usually work for this unless all + // the values are Nan or Inf. This follows the same logic as + // NumericCompare for Minloc/Maxlox in extrema.cpp. cmp = builder.create( loc, isMax ? mlir::arith::CmpFPredicate::OGT : mlir::arith::CmpFPredicate::OLT, elem, reduction); + + mlir::Value cmpNan = builder.create( + loc, mlir::arith::CmpFPredicate::UNE, reduction, reduction); + mlir::Value cmpNan2 = builder.create( + loc, mlir::arith::CmpFPredicate::OEQ, elem, elem); + cmpNan = builder.create(loc, cmpNan, cmpNan2); + cmp = builder.create(loc, cmp, cmpNan); } else if (elementType.isa()) { cmp = builder.create( loc, @@ -793,10 +768,16 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, llvm_unreachable("unsupported type"); } + // The condition used for the loop is isFirst || . 
+ isFirst = builder.create(loc, cmp.getType(), isFirst); + isFirst = builder.create( + loc, isFirst, builder.createIntegerConstant(loc, cmp.getType(), 1)); + cmp = builder.create(loc, cmp, isFirst); fir::IfOp ifOp = builder.create(loc, elementType, cmp, /*withElseRegion*/ true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + builder.create(loc, flagSet, flagRef); mlir::Type resultElemTy = hlfir::getFortranElementType(resultArr.getType()); mlir::Type returnRefTy = builder.getRefType(resultElemTy); mlir::IndexType idxTy = builder.getIndexType(); diff --git a/flang/test/HLFIR/maxloc-elemental.fir b/flang/test/HLFIR/maxloc-elemental.fir index b4a3ca0d86068..c97117dd10de1 100644 --- a/flang/test/HLFIR/maxloc-elemental.fir +++ b/flang/test/HLFIR/maxloc-elemental.fir @@ -23,6 +23,7 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} return } // CHECK-LABEL: func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"}, %arg1: !fir.ref {fir.bindc_name = "val"}, %arg2: !fir.box> {fir.bindc_name = "m"}) { +// CHECK-NEXT: %true = arith.constant true // CHECK-NEXT: %c-2147483648_i32 = arith.constant -2147483648 : i32 // CHECK-NEXT: %c1_i32 = arith.constant 1 : i32 // CHECK-NEXT: %c0 = arith.constant 0 : index @@ -45,14 +46,18 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} // CHECK-NEXT: %[[V16:.*]] = fir.load %[[V15]] : !fir.ref // CHECK-NEXT: %[[V17:.*]] = arith.cmpi sge, %[[V16]], %[[V4]] : i32 // CHECK-NEXT: %[[V18:.*]] = fir.if %[[V17]] -> (i32) { -// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref +// CHECK-NEXT: %[[ISFIRST:.*]] = fir.load %[[V0]] : !fir.ref // CHECK-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[V1]]#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: %[[SUB:.*]] = arith.subi %[[DIMS]]#0, %c1 : index // CHECK-NEXT: %[[ADD:.*]] = arith.addi %[[V14]], %[[SUB]] : index // CHECK-NEXT: %[[V19:.*]] = hlfir.designate %[[V1]]#0 (%[[ADD]]) : (!fir.box>, index) -> !fir.ref // CHECK-NEXT: %[[V20:.*]] 
= fir.load %[[V19]] : !fir.ref // CHECK-NEXT: %[[V21:.*]] = arith.cmpi sgt, %[[V20]], %arg4 : i32 -// CHECK-NEXT: %[[V22:.*]] = fir.if %[[V21]] -> (i32) { +// CHECK-NEXT: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i32) -> i1 +// CHECK-NEXT: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK-NEXT: %[[ORCOND:.*]] = arith.ori %[[V21]], %[[ISFIRSTNOT]] : i1 +// CHECK-NEXT: %[[V22:.*]] = fir.if %[[ORCOND]] -> (i32) { +// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref // CHECK-NEXT: %[[V23:.*]] = hlfir.designate %[[RES]] (%c1) : (!fir.ref>, index) -> !fir.ref // CHECK-NEXT: %[[V24:.*]] = fir.convert %[[V14]] : (index) -> i32 // CHECK-NEXT: fir.store %[[V24]] to %[[V23]] : !fir.ref @@ -66,15 +71,6 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[V12:.*]] = fir.load %[[V0]] : !fir.ref -// CHECK-NEXT: %[[V13:.*]] = arith.cmpi eq, %[[V12]], %c1_i32 : i32 -// CHECK-NEXT: fir.if %[[V13]] { -// CHECK-NEXT: %[[V14:.*]] = arith.cmpi eq, %[[V11]], %c-2147483648_i32 : i32 -// CHECK-NEXT: fir.if %[[V14]] { -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[RES]] (%c1) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: fir.store %c1_i32 to %[[V15]] : !fir.ref -// CHECK-NEXT: } -// CHECK-NEXT: } // CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { // CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref @@ -110,21 +106,29 @@ func.func @_QPtest_float(%arg0: !fir.box> {fir.bindc_name = "a return } // CHECK-LABEL: _QPtest_float -// CHECK: %cst = arith.constant -3.40282347E+38 : f32 +// CHECK: %cst = arith.constant 0xFF800000 : f32 // CHECK: %[[V11:.*]] = fir.do_loop %arg3 = %c0 to %[[V10:.*]] step %c1 iter_args(%arg4 = %cst) -> (f32) { // CHECK-NEXT: %[[V14:.*]] = arith.addi %arg3, %c1 : index // 
CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V1:.*]]#0 (%[[V14]]) : (!fir.box>, index) -> !fir.ref // CHECK-NEXT: %[[V16:.*]] = fir.load %[[V15]] : !fir.ref // CHECK-NEXT: %[[V17:.*]] = arith.cmpf oge, %[[V16]], %[[V4:.*]] : f32 // CHECK-NEXT: %[[V18:.*]] = fir.if %[[V17]] -> (f32) { -// CHECK-NEXT: fir.store %c1_i32 to %[[V0:.*]] : !fir.ref +// CHECK-NEXT: %[[ISFIRST:.*]] = fir.load %[[V0:.*]] : !fir.ref // CHECK-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %2#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: %[[SUB:.*]] = arith.subi %[[DIMS]]#0, %c1 : index // CHECK-NEXT: %[[ADD:.*]] = arith.addi %[[V14]], %[[SUB]] : index // CHECK-NEXT: %[[V19:.*]] = hlfir.designate %[[V1]]#0 (%[[ADD]]) : (!fir.box>, index) -> !fir.ref // CHECK-NEXT: %[[V20:.*]] = fir.load %[[V19]] : !fir.ref -// CHECK-NEXT: %[[V21:.*]] = arith.cmpf ogt, %[[V20]], %arg4 fastmath : f32 -// CHECK-NEXT: %[[V22:.*]] = fir.if %[[V21]] -> (f32) { +// CHECK-NEXT: %[[NEW_MIN:.*]] = arith.cmpf ogt, %[[V20]], %arg4 fastmath : f32 +// CHECK-NEXT: %[[CONDRED:.*]] = arith.cmpf une, %arg4, %arg4 fastmath : f32 +// CHECK-NEXT: %[[CONDELEM:.*]] = arith.cmpf oeq, %[[V20]], %[[V20]] fastmath : f32 +// CHECK-NEXT: %[[ANDCOND:.*]] = arith.andi %[[CONDRED]], %[[CONDELEM]] : i1 +// CHECK-NEXT: %[[NEW_MIN2:.*]] = arith.ori %[[NEW_MIN]], %[[ANDCOND]] : i1 +// CHECK-NEXT: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i32) -> i1 +// CHECK-NEXT: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK-NEXT: %[[ORCOND:.*]] = arith.ori %[[NEW_MIN2]], %[[ISFIRSTNOT]] : i1 +// CHECK-NEXT: %[[V22:.*]] = fir.if %[[ORCOND]] -> (f32) { +// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref // CHECK-NEXT: %[[V23:.*]] = hlfir.designate %{{.}} (%c1) : (!fir.ref>, index) -> !fir.ref // CHECK-NEXT: %[[V24:.*]] = fir.convert %[[V14]] : (index) -> i32 // CHECK-NEXT: fir.store %[[V24]] to %[[V23]] : !fir.ref diff --git a/flang/test/HLFIR/minloc-elemental.fir b/flang/test/HLFIR/minloc-elemental.fir index 
5cc608b65be8b..58cfe3ea01279 100644 --- a/flang/test/HLFIR/minloc-elemental.fir +++ b/flang/test/HLFIR/minloc-elemental.fir @@ -23,6 +23,7 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} return } // CHECK-LABEL: func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"}, %arg1: !fir.ref {fir.bindc_name = "val"}, %arg2: !fir.box> {fir.bindc_name = "m"}) { +// CHECK-NEXT: %true = arith.constant true // CHECK-NEXT: %c2147483647_i32 = arith.constant 2147483647 : i32 // CHECK-NEXT: %c1_i32 = arith.constant 1 : i32 // CHECK-NEXT: %c0 = arith.constant 0 : index @@ -45,14 +46,18 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} // CHECK-NEXT: %[[V16:.*]] = fir.load %[[V15]] : !fir.ref // CHECK-NEXT: %[[V17:.*]] = arith.cmpi sge, %[[V16]], %[[V4]] : i32 // CHECK-NEXT: %[[V18:.*]] = fir.if %[[V17]] -> (i32) { -// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref +// CHECK-NEXT: %[[ISFIRST:.*]] = fir.load %[[V0]] : !fir.ref // CHECK-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[V1]]#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: %[[SUB:.*]] = arith.subi %[[DIMS]]#0, %c1 : index // CHECK-NEXT: %[[ADD:.*]] = arith.addi %[[V14]], %[[SUB]] : index // CHECK-NEXT: %[[V19:.*]] = hlfir.designate %[[V1]]#0 (%[[ADD]]) : (!fir.box>, index) -> !fir.ref // CHECK-NEXT: %[[V20:.*]] = fir.load %[[V19]] : !fir.ref // CHECK-NEXT: %[[V21:.*]] = arith.cmpi slt, %[[V20]], %arg4 : i32 -// CHECK-NEXT: %[[V22:.*]] = fir.if %[[V21]] -> (i32) { +// CHECK-NEXT: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i32) -> i1 +// CHECK-NEXT: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK-NEXT: %[[ORCOND:.*]] = arith.ori %[[V21]], %[[ISFIRSTNOT]] : i1 +// CHECK-NEXT: %[[V22:.*]] = fir.if %[[ORCOND]] -> (i32) { +// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref // CHECK-NEXT: %[[V23:.*]] = hlfir.designate %[[RES]] (%c1) : (!fir.ref>, index) -> !fir.ref // CHECK-NEXT: %[[V24:.*]] = fir.convert %[[V14]] : (index) -> i32 // CHECK-NEXT: 
fir.store %[[V24]] to %[[V23]] : !fir.ref @@ -66,15 +71,6 @@ func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "array"} // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[V12:.*]] = fir.load %[[V0]] : !fir.ref -// CHECK-NEXT: %[[V13:.*]] = arith.cmpi eq, %[[V12]], %c1_i32 : i32 -// CHECK-NEXT: fir.if %[[V13]] { -// CHECK-NEXT: %[[V14:.*]] = arith.cmpi eq, %[[V11]], %c2147483647_i32 : i32 -// CHECK-NEXT: fir.if %[[V14]] { -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[RES]] (%c1) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: fir.store %c1_i32 to %[[V15]] : !fir.ref -// CHECK-NEXT: } -// CHECK-NEXT: } // CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { // CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref @@ -109,6 +105,7 @@ func.func @_QPtest_kind2(%arg0: !fir.box> {fir.bindc_name = "a return } // CHECK-LABEL: func.func @_QPtest_kind2(%arg0: !fir.box> {fir.bindc_name = "array"}, %arg1: !fir.ref {fir.bindc_name = "val"}, %arg2: !fir.box> {fir.bindc_name = "m"}) { +// CHECK-NEXT: %true = arith.constant true // CHECK-NEXT: %c2147483647_i32 = arith.constant 2147483647 : i32 // CHECK-NEXT: %c1_i16 = arith.constant 1 : i16 // CHECK-NEXT: %c0 = arith.constant 0 : index @@ -131,14 +128,18 @@ func.func @_QPtest_kind2(%arg0: !fir.box> {fir.bindc_name = "a // CHECK-NEXT: %[[V16:.*]] = fir.load %[[V15]] : !fir.ref // CHECK-NEXT: %[[V17:.*]] = arith.cmpi sge, %[[V16]], %[[V4]] : i32 // CHECK-NEXT: %[[V18:.*]] = fir.if %[[V17]] -> (i32) { -// CHECK-NEXT: fir.store %c1_i16 to %[[V0]] : !fir.ref +// CHECK-NEXT: %[[ISFIRST:.*]] = fir.load %[[V0]] : !fir.ref // CHECK-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %[[V1]]#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: %[[SUB:.*]] = arith.subi %[[DIMS]]#0, %c1 : index // CHECK-NEXT: %[[ADD:.*]] = arith.addi 
%[[V14]], %[[SUB]] : index // CHECK-NEXT: %[[V19:.*]] = hlfir.designate %[[V1]]#0 (%[[ADD]]) : (!fir.box>, index) -> !fir.ref // CHECK-NEXT: %[[V20:.*]] = fir.load %[[V19]] : !fir.ref // CHECK-NEXT: %[[V21:.*]] = arith.cmpi slt, %[[V20]], %arg4 : i32 -// CHECK-NEXT: %[[V22:.*]] = fir.if %[[V21]] -> (i32) { +// CHECK-NEXT: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i16) -> i1 +// CHECK-NEXT: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK-NEXT: %[[ORCOND:.*]] = arith.ori %[[V21]], %[[ISFIRSTNOT]] : i1 +// CHECK-NEXT: %[[V22:.*]] = fir.if %[[ORCOND]] -> (i32) { +// CHECK-NEXT: fir.store %c1_i16 to %[[V0]] : !fir.ref // CHECK-NEXT: %[[V23:.*]] = hlfir.designate %[[RES]] (%c1) : (!fir.ref>, index) -> !fir.ref // CHECK-NEXT: %[[V24:.*]] = fir.convert %[[V14]] : (index) -> i16 // CHECK-NEXT: fir.store %[[V24]] to %[[V23]] : !fir.ref @@ -152,15 +153,6 @@ func.func @_QPtest_kind2(%arg0: !fir.box> {fir.bindc_name = "a // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V18]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[V12:.*]] = fir.load %[[V0]] : !fir.ref -// CHECK-NEXT: %[[V13:.*]] = arith.cmpi eq, %[[V12]], %c1_i16 : i16 -// CHECK-NEXT: fir.if %[[V13]] { -// CHECK-NEXT: %[[V14:.*]] = arith.cmpi eq, %[[V11]], %c2147483647_i32 : i32 -// CHECK-NEXT: fir.if %[[V14]] { -// CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[RES]] (%c1) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: fir.store %c1_i16 to %[[V15]] : !fir.ref -// CHECK-NEXT: } -// CHECK-NEXT: } // CHECK-NEXT: %[[BD:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: fir.do_loop %arg3 = %c1 to %[[BD]]#1 step %c1 unordered { // CHECK-NEXT: %[[V13:.*]] = hlfir.designate %[[RES]] (%arg3) : (!fir.ref>, index) -> !fir.ref @@ -206,6 +198,7 @@ func.func @_QPtest_kind2_convert(%arg0: !fir.box> {fir.bindc_n // CHECK-LABEL: _QPtest_kind2_convert // CHECK-SAME: (%arg0: !fir.box> {fir.bindc_name = "array"}, %arg1: !fir.ref {fir.bindc_name = "val"}, %arg2: !fir.box> 
{fir.bindc_name = "m"}) { // CHECK-NEXT: %false = arith.constant false +// CHECK-NEXT: %true = arith.constant true // CHECK-NEXT: %c2147483647_i32 = arith.constant 2147483647 : i32 // CHECK-NEXT: %c1_i16 = arith.constant 1 : i16 // CHECK-NEXT: %c0 = arith.constant 0 : index @@ -228,14 +221,18 @@ func.func @_QPtest_kind2_convert(%arg0: !fir.box> {fir.bindc_n // CHECK-NEXT: %[[V17:.*]] = fir.load %[[V16]] : !fir.ref // CHECK-NEXT: %[[V18:.*]] = arith.cmpi sge, %[[V17]], %[[V5]] : i32 // CHECK-NEXT: %[[V19:.*]] = fir.if %[[V18]] -> (i32) { -// CHECK-NEXT: fir.store %c1_i16 to %[[V0]] : !fir.ref +// CHECK-NEXT: %[[ISFIRST:.*]] = fir.load %[[V0]] : !fir.ref // CHECK-NEXT: %[[V20:.*]]:3 = fir.box_dims %[[V2]]#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: %[[V21:.*]] = arith.subi %[[V20]]#0, %c1 : index // CHECK-NEXT: %[[V22:.*]] = arith.addi %[[V15]], %[[V21]] : index // CHECK-NEXT: %[[V23:.*]] = hlfir.designate %[[V2]]#0 (%[[V22]]) : (!fir.box>, index) -> !fir.ref // CHECK-NEXT: %[[V24:.*]] = fir.load %[[V23]] : !fir.ref // CHECK-NEXT: %[[V25:.*]] = arith.cmpi slt, %[[V24]], %arg4 : i32 -// CHECK-NEXT: %[[V26:.*]] = fir.if %[[V25]] -> (i32) { +// CHECK-NEXT: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i16) -> i1 +// CHECK-NEXT: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK-NEXT: %[[ORCOND:.*]] = arith.ori %[[V25]], %[[ISFIRSTNOT]] : i1 +// CHECK-NEXT: %[[V26:.*]] = fir.if %[[ORCOND]] -> (i32) { +// CHECK-NEXT: fir.store %c1_i16 to %[[V0]] : !fir.ref // CHECK-NEXT: %[[V27:.*]] = hlfir.designate %[[V1]] (%c1) : (!fir.ref>, index) -> !fir.ref // CHECK-NEXT: %[[V28:.*]] = fir.convert %[[V15]] : (index) -> i16 // CHECK-NEXT: fir.store %[[V28]] to %[[V27]] : !fir.ref @@ -249,15 +246,6 @@ func.func @_QPtest_kind2_convert(%arg0: !fir.box> {fir.bindc_n // CHECK-NEXT: } // CHECK-NEXT: fir.result %[[V19]] : i32 // CHECK-NEXT: } -// CHECK-NEXT: %[[V10:.*]] = fir.load %[[V0]] : !fir.ref -// CHECK-NEXT: %[[V11:.*]] = arith.cmpi eq, 
%[[V10]], %c1_i16 : i16 -// CHECK-NEXT: fir.if %[[V11]] { -// CHECK-NEXT: %[[V15]] = arith.cmpi eq, %[[V9]], %c2147483647_i32 : i32 -// CHECK-NEXT: fir.if %[[V15]] { -// CHECK-NEXT: %[[V16]] = hlfir.designate %[[V1]] (%c1) : (!fir.ref>, index) -> !fir.ref -// CHECK-NEXT: fir.store %c1_i16 to %[[V16]] : !fir.ref -// CHECK-NEXT: } -// CHECK-NEXT: } // CHECK-NEXT: %[[V12:.*]] = hlfir.as_expr %[[V1]] move %false : (!fir.ref>, i1) -> !hlfir.expr<1xi16> // CHECK-NEXT: %[[V13:.*]] = fir.shape %c1 : (index) -> !fir.shape<1> // CHECK-NEXT: %[[V14:.*]] = hlfir.elemental %[[V13]] unordered : (!fir.shape<1>) -> !hlfir.expr { @@ -295,21 +283,29 @@ func.func @_QPtest_float(%arg0: !fir.box> {fir.bindc_name = "a return } // CHECK-LABEL: _QPtest_float -// CHECK: %cst = arith.constant 3.40282347E+38 : f32 +// CHECK: %cst = arith.constant 0x7F800000 : f32 // CHECK: %[[V11:.*]] = fir.do_loop %arg3 = %c0 to %[[V10:.*]] step %c1 iter_args(%arg4 = %cst) -> (f32) { // CHECK-NEXT: %[[V14:.*]] = arith.addi %arg3, %c1 : index // CHECK-NEXT: %[[V15:.*]] = hlfir.designate %[[V1:.*]]#0 (%[[V14]]) : (!fir.box>, index) -> !fir.ref // CHECK-NEXT: %[[V16:.*]] = fir.load %[[V15]] : !fir.ref // CHECK-NEXT: %[[V17:.*]] = arith.cmpf oge, %[[V16]], %[[V4:.*]] : f32 // CHECK-NEXT: %[[V18:.*]] = fir.if %[[V17]] -> (f32) { -// CHECK-NEXT: fir.store %c1_i32 to %[[V0:.*]] : !fir.ref +// CHECK-NEXT: %[[ISFIRST:.*]] = fir.load %[[V0:.*]] : !fir.ref // CHECK-NEXT: %[[DIMS:.*]]:3 = fir.box_dims %2#0, %c0 : (!fir.box>, index) -> (index, index, index) // CHECK-NEXT: %[[SUB:.*]] = arith.subi %[[DIMS]]#0, %c1 : index // CHECK-NEXT: %[[ADD:.*]] = arith.addi %[[V14]], %[[SUB]] : index // CHECK-NEXT: %[[V19:.*]] = hlfir.designate %[[V1]]#0 (%[[ADD]]) : (!fir.box>, index) -> !fir.ref // CHECK-NEXT: %[[V20:.*]] = fir.load %[[V19]] : !fir.ref -// CHECK-NEXT: %[[V21:.*]] = arith.cmpf olt, %[[V20]], %arg4 fastmath : f32 -// CHECK-NEXT: %[[V22:.*]] = fir.if %[[V21]] -> (f32) { +// CHECK-NEXT: %[[NEW_MIN:.*]] = arith.cmpf 
olt, %[[V20]], %arg4 fastmath : f32 +// CHECK-NEXT: %[[CONDRED:.*]] = arith.cmpf une, %arg4, %arg4 fastmath : f32 +// CHECK-NEXT: %[[CONDELEM:.*]] = arith.cmpf oeq, %[[V20]], %[[V20]] fastmath : f32 +// CHECK-NEXT: %[[ANDCOND:.*]] = arith.andi %[[CONDRED]], %[[CONDELEM]] : i1 +// CHECK-NEXT: %[[NEW_MIN2:.*]] = arith.ori %[[NEW_MIN]], %[[ANDCOND]] : i1 +// CHECK-NEXT: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i32) -> i1 +// CHECK-NEXT: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK-NEXT: %[[ORCOND:.*]] = arith.ori %[[NEW_MIN2]], %[[ISFIRSTNOT]] : i1 +// CHECK-NEXT: %[[V22:.*]] = fir.if %[[ORCOND]] -> (f32) { +// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref // CHECK-NEXT: %[[V23:.*]] = hlfir.designate %{{.}} (%c1) : (!fir.ref>, index) -> !fir.ref // CHECK-NEXT: %[[V24:.*]] = fir.convert %[[V14]] : (index) -> i32 // CHECK-NEXT: fir.store %[[V24]] to %[[V23]] : !fir.ref diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir index cd059cc797a3f..ce9f2dbd3e0fb 100644 --- a/flang/test/Transforms/simplifyintrinsics.fir +++ b/flang/test/Transforms/simplifyintrinsics.fir @@ -1780,11 +1780,15 @@ func.func @_QPtestminloc_works1d(%arg0: !fir.ref> {fir.bindc_ // CHECK: %[[MASK_IF_ITEM:.*]] = fir.convert %[[MASK_ITEMVAL]] : (!fir.logical<4>) -> i1 // CHECK: %[[IF_MASK:.*]] = fir.if %[[MASK_IF_ITEM]] -> (i32) { // CHECK: %[[FLAG_SET2:.*]] = arith.constant 1 : i32 -// CHECK: fir.store %[[FLAG_SET2]] to %[[FLAG_ALLOC]] : !fir.ref +// CHECK: %[[ISFIRST:.*]] = fir.load %[[FLAG_ALLOC]] : !fir.ref // CHECK: %[[INARR_ITEM:.*]] = fir.coordinate_of %[[BOX_INARR]], %[[ITER]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[INARR_ITEMVAL:.*]] = fir.load %[[INARR_ITEM]] : !fir.ref // CHECK: %[[NEW_MIN:.*]] = arith.cmpi slt, %[[INARR_ITEMVAL]], %[[MIN]] : i32 -// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[NEW_MIN]] -> (i32) { +// CHECK: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i32) -> i1 +// CHECK: 
%[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK: %[[ORCOND:.*]] = arith.ori %[[NEW_MIN]], %[[ISFIRSTNOT]] : i1 +// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[ORCOND]] -> (i32) { +// CHECK: fir.store %[[FLAG_SET2]] to %[[FLAG_ALLOC]] : !fir.ref // CHECK: %[[ONE:.*]] = arith.constant 1 : i32 // CHECK: %[[OUTARR_IDX:.*]] = arith.constant 0 : index // CHECK: %[[OUTARR_ITEM:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[OUTARR_IDX]] : (!fir.box>>, index) -> !fir.ref @@ -1801,17 +1805,6 @@ func.func @_QPtestminloc_works1d(%arg0: !fir.ref> {fir.bindc_ // CHECK: } // CHECK: fir.result %[[IF_MASK:.*]] : i32 // CHECK: } -// CHECK: %[[FLAG_VAL:.*]] = fir.load %[[FLAG_ALLOC]] : !fir.ref -// CHECK: %[[FLAG_WAS_SET:.*]] = arith.cmpi eq, %[[FLAG_VAL]], %[[FLAG_SET]] : i32 -// CHECK: fir.if %[[FLAG_WAS_SET]] { -// CHECK: %[[TEST_MAX:.*]] = arith.constant 2147483647 : i32 -// CHECK: %[[INIT_NOT_CHANGED:.*]] = arith.cmpi eq, %[[TEST_MAX]], %[[DO_LOOP:.*]] : i32 -// CHECK: fir.if %[[INIT_NOT_CHANGED]] { -// CHECK: %[[FLAG_OUTARR_IDX:.*]] = arith.constant 0 : index -// CHECK: %[[FLAG_OUTARR_ITEM:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[FLAG_OUTARR_IDX]] : (!fir.box>>, index) -> !fir.ref -// CHECK: fir.store %[[FLAG_SET]] to %[[FLAG_OUTARR_ITEM]] : !fir.ref -// CHECK: } -// CHECK: } // CHECK: %[[REF_BOX_OUTARR:.*]] = fir.convert %[[REF_BOX_OUTARR_NONE]] : (!fir.ref>) -> !fir.ref>>> // CHECK: fir.store %[[BOX_OUTARR]] to %[[REF_BOX_OUTARR]] : !fir.ref>>> // CHECK: return @@ -1903,10 +1896,16 @@ func.func @_QPtestminloc_works2d_nomask(%arg0: !fir.ref> { // CHECK: %[[EXTENT1:.*]] = arith.subi %[[DIMS1]]#1, %[[C_INDEX1]] : index // CHECK: %[[DOLOOP0:.*]] = fir.do_loop %[[ITER0:.*]] = %[[C_INDEX0]] to %[[EXTENT1]] step %[[C_INDEX1]] iter_args(%[[MIN0:.*]] = %[[MAX]]) -> (i32) { // CHECK: %[[DOLOOP1:.*]] = fir.do_loop %[[ITER1:.*]] = %[[C_INDEX0]] to %[[EXTENT0]] step %[[C_INDEX1]] iter_args(%[[MIN1:.*]] = %[[MIN0]]) -> (i32) { +// CHECK: %[[FLAG_SET2:.*]] = arith.constant 1 
: i64 +// CHECK: %[[ISFIRST:.*]] = fir.load %[[FLAG_ALLOC]] : !fir.ref // CHECK: %[[INARR_ITEM:.*]] = fir.coordinate_of %[[BOX_INARR]], %[[ITER1]], %[[ITER0]] : (!fir.box>, index, index) -> !fir.ref // CHECK: %[[INARR_ITEMVAL:.*]] = fir.load %[[INARR_ITEM]] : !fir.ref // CHECK: %[[NEW_MIN:.*]] = arith.cmpi slt, %[[INARR_ITEMVAL]], %[[MIN1]] : i32 -// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[NEW_MIN]] -> (i32) { +// CHECK: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i64) -> i1 +// CHECK: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK: %[[ORCOND:.*]] = arith.ori %[[NEW_MIN]], %[[ISFIRSTNOT]] : i1 +// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[ORCOND]] -> (i32) { +// CHECK: fir.store %[[FLAG_SET2]] to %[[FLAG_ALLOC]] : !fir.ref // CHECK: %[[ONE:.*]] = arith.constant 1 : i64 // CHECK: %[[OUTARR_IDX0:.*]] = arith.constant 0 : index // CHECK: %[[OUTARR_ITEM0:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[OUTARR_IDX0]] : (!fir.box>>, index) -> !fir.ref @@ -1926,20 +1925,6 @@ func.func @_QPtestminloc_works2d_nomask(%arg0: !fir.ref> { // CHECK: } // CHECK: fir.result %[[DOLOOP1:.*]] : i32 // CHECK: } -// CHECK: %[[FLAG_VAL:.*]] = fir.load %[[FLAG_ALLOC]] : !fir.ref -// CHECK: %[[FLAG_WAS_SET:.*]] = arith.cmpi eq, %[[FLAG_VAL]], %[[FLAG_SET]] : i64 -// CHECK: fir.if %[[FLAG_WAS_SET]] { -// CHECK: %[[TEST_MAX:.*]] = arith.constant 2147483647 : i32 -// CHECK: %[[INIT_NOT_CHANGED:.*]] = arith.cmpi eq, %[[TEST_MAX]], %[[DO_LOOP:.*]] : i32 -// CHECK: fir.if %[[INIT_NOT_CHANGED]] { -// CHECK: %[[FLAG_OUTARR_IDX0:.*]] = arith.constant 0 : index -// CHECK: %[[FLAG_OUTARR_ITEM0:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[FLAG_OUTARR_IDX0]] : (!fir.box>>, index) -> !fir.ref -// CHECK: fir.store %[[FLAG_SET]] to %[[FLAG_OUTARR_ITEM0]] : !fir.ref -// CHECK: %[[FLAG_OUTARR_IDX1:.*]] = arith.constant 1 : index -// CHECK: %[[FLAG_OUTARR_ITEM1:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[FLAG_OUTARR_IDX1]] : (!fir.box>>, index) -> !fir.ref -// CHECK: fir.store %[[FLAG_SET]] 
to %[[FLAG_OUTARR_ITEM1]] : !fir.ref>) -> !fir.ref>>> // CHECK: fir.store %[[BOX_OUTARR]] to %[[REF_BOX_OUTARR]] : !fir.ref>>> // CHECK: return @@ -2021,16 +2006,25 @@ func.func @_QPtestminloc_works1d_scalarmask_f64(%arg0: !fir.ref -// CHECK: %[[MAX:.*]] = arith.constant 1.7976931348623157E+308 : f64 +// CHECK: %[[MAX:.*]] = arith.constant 0x7FF0000000000000 : f64 // CHECK: %[[C_INDEX1:.*]] = arith.constant 1 : index // CHECK: %[[DIM_INDEX:.*]] = arith.constant 0 : index // CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[BOX_INARR]], %[[DIM_INDEX]] : (!fir.box>, index) -> (index, index, index) // CHECK: %[[EXTENT:.*]] = arith.subi %[[DIMS]]#1, %[[C_INDEX1]] : index // CHECK: %[[DOLOOP:.*]] = fir.do_loop %[[ITER:.*]] = %[[C_INDEX0]] to %[[EXTENT]] step %[[C_INDEX1]] iter_args(%[[MIN:.*]] = %[[MAX]]) -> (f64) { +// CHECK: %[[FLAG_SET2:.*]] = arith.constant 1 : i32 +// CHECK: %[[ISFIRST:.*]] = fir.load %[[FLAG_ALLOC]] : !fir.ref // CHECK: %[[INARR_ITEM:.*]] = fir.coordinate_of %[[BOX_INARR]], %[[ITER]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[INARR_ITEMVAL:.*]] = fir.load %[[INARR_ITEM]] : !fir.ref -// CHECK: %[[NEW_MIN:.*]] = arith.cmpf olt, %[[INARR_ITEMVAL]], %[[MIN]] fastmath<{{.*}}> : f64 -// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[NEW_MIN]] -> (f64) { +// CHECK: %[[NEW_MIN:.*]] = arith.cmpf olt, %[[INARR_ITEMVAL]], %arg4 fastmath : f64 +// CHECK: %[[CONDRED:.*]] = arith.cmpf une, %arg4, %arg4 fastmath : f64 +// CHECK: %[[CONDELEM:.*]] = arith.cmpf oeq, %[[INARR_ITEMVAL]], %[[INARR_ITEMVAL]] fastmath : f64 +// CHECK: %[[ANDCOND:.*]] = arith.andi %[[CONDRED]], %[[CONDELEM]] : i1 +// CHECK: %[[NEW_MIN2:.*]] = arith.ori %[[NEW_MIN]], %[[ANDCOND]] : i1 +// CHECK: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i32) -> i1 +// CHECK: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK: %[[ORCOND:.*]] = arith.ori %[[NEW_MIN2]], %[[ISFIRSTNOT]] : i1 +// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[ORCOND]] -> (f64) { // CHECK: %[[ONE:.*]] = arith.constant 1 : i32 // 
CHECK: %[[OUTARR_IDX:.*]] = arith.constant 0 : index // CHECK: %[[OUTARR_ITEM:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[OUTARR_IDX]] : (!fir.box>>, index) -> !fir.ref @@ -2044,18 +2038,6 @@ func.func @_QPtestminloc_works1d_scalarmask_f64(%arg0: !fir.ref -// CHECK: %[[FLAG_WAS_SET:.*]] = arith.cmpi eq, %[[FLAG_VAL]], %[[FLAG_CHECK]] : i32 -// CHECK: fir.if %[[FLAG_WAS_SET]] { -// CHECK: %[[TEST_MAX:.*]] = arith.constant 1.7976931348623157E+308 : f64 -// CHECK: %[[INIT_NOT_CHANGED:.*]] = arith.cmpf oeq, %[[TEST_MAX]], %[[INIT_RES:.*]] fastmath<{{.*}}> : f64 -// CHECK: fir.if %[[INIT_NOT_CHANGED]] { -// CHECK: %[[FLAG_OUTARR_IDX:.*]] = arith.constant 0 : index -// CHECK: %[[FLAG_OUTARR_ITEM:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[FLAG_OUTARR_IDX]] : (!fir.box>>, index) -> !fir.ref -// CHECK: fir.store %[[FLAG_CHECK]] to %[[FLAG_OUTARR_ITEM]] : !fir.ref -// CHECK: } -// CHECK: } // CHECK: %[[REF_BOX_OUTARR:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref>>> // CHECK: fir.store %[[BOX_OUTARR]] to %[[REF_BOX_OUTARR]] : !fir.ref>>> // CHECK: return @@ -2444,11 +2426,15 @@ func.func @_QPtestmaxloc_works1d(%arg0: !fir.ref> {fir.bindc_ // CHECK: %[[MASK_IF_ITEM:.*]] = fir.convert %[[MASK_ITEMVAL]] : (!fir.logical<4>) -> i1 // CHECK: %[[IF_MASK:.*]] = fir.if %[[MASK_IF_ITEM]] -> (i32) { // CHECK: %[[FLAG_SET2:.*]] = arith.constant 1 : i32 -// CHECK: fir.store %[[FLAG_SET2]] to %[[FLAG_ALLOC]] : !fir.ref +// CHECK: %[[ISFIRST:.*]] = fir.load %[[FLAG_ALLOC]] : !fir.ref // CHECK: %[[INARR_ITEM:.*]] = fir.coordinate_of %[[BOX_INARR]], %[[ITER]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[INARR_ITEMVAL:.*]] = fir.load %[[INARR_ITEM]] : !fir.ref // CHECK: %[[NEW_MIN:.*]] = arith.cmpi sgt, %[[INARR_ITEMVAL]], %[[MIN]] : i32 -// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[NEW_MIN]] -> (i32) { +// CHECK: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i32) -> i1 +// CHECK: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK: %[[ORCOND:.*]] = arith.ori 
%[[NEW_MIN]], %[[ISFIRSTNOT]] : i1 +// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[ORCOND]] -> (i32) { +// CHECK: fir.store %[[FLAG_SET2]] to %[[FLAG_ALLOC]] : !fir.ref // CHECK: %[[ONE:.*]] = arith.constant 1 : i32 // CHECK: %[[OUTARR_IDX:.*]] = arith.constant 0 : index // CHECK: %[[OUTARR_ITEM:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[OUTARR_IDX]] : (!fir.box>>, index) -> !fir.ref @@ -2465,17 +2451,6 @@ func.func @_QPtestmaxloc_works1d(%arg0: !fir.ref> {fir.bindc_ // CHECK: } // CHECK: fir.result %[[IF_MASK:.*]] : i32 // CHECK: } -// CHECK: %[[FLAG_VAL:.*]] = fir.load %[[FLAG_ALLOC]] : !fir.ref -// CHECK: %[[FLAG_WAS_SET:.*]] = arith.cmpi eq, %[[FLAG_VAL]], %[[FLAG_SET]] : i32 -// CHECK: fir.if %[[FLAG_WAS_SET]] { -// CHECK: %[[TEST_MAX:.*]] = arith.constant -2147483648 : i32 -// CHECK: %[[INIT_NOT_CHANGED:.*]] = arith.cmpi eq, %[[TEST_MAX]], %[[DO_LOOP:.*]] : i32 -// CHECK: fir.if %[[INIT_NOT_CHANGED]] { -// CHECK: %[[FLAG_OUTARR_IDX:.*]] = arith.constant 0 : index -// CHECK: %[[FLAG_OUTARR_ITEM:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[FLAG_OUTARR_IDX]] : (!fir.box>>, index) -> !fir.ref -// CHECK: fir.store %[[FLAG_SET]] to %[[FLAG_OUTARR_ITEM]] : !fir.ref -// CHECK: } -// CHECK: } // CHECK: %[[REF_BOX_OUTARR:.*]] = fir.convert %[[REF_BOX_OUTARR_NONE]] : (!fir.ref>) -> !fir.ref>>> // CHECK: fir.store %[[BOX_OUTARR]] to %[[REF_BOX_OUTARR]] : !fir.ref>>> // CHECK: return @@ -2557,16 +2532,25 @@ func.func @_QPtestmaxloc_works1d_scalarmask_f64(%arg0: !fir.ref -// CHECK: %[[MAX:.*]] = arith.constant -1.7976931348623157E+308 : f64 +// CHECK: %[[MAX:.*]] = arith.constant 0xFFF0000000000000 : f64 // CHECK: %[[C_INDEX1:.*]] = arith.constant 1 : index // CHECK: %[[DIM_INDEX:.*]] = arith.constant 0 : index // CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[BOX_INARR]], %[[DIM_INDEX]] : (!fir.box>, index) -> (index, index, index) // CHECK: %[[EXTENT:.*]] = arith.subi %[[DIMS]]#1, %[[C_INDEX1]] : index // CHECK: %[[DOLOOP:.*]] = fir.do_loop %[[ITER:.*]] = %[[C_INDEX0]] to 
%[[EXTENT]] step %[[C_INDEX1]] iter_args(%[[MIN:.*]] = %[[MAX]]) -> (f64) { +// CHECK: %[[FLAG_SET2:.*]] = arith.constant 1 : i32 +// CHECK: %[[ISFIRST:.*]] = fir.load %[[FLAG_ALLOC]] : !fir.ref // CHECK: %[[INARR_ITEM:.*]] = fir.coordinate_of %[[BOX_INARR]], %[[ITER]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[INARR_ITEMVAL:.*]] = fir.load %[[INARR_ITEM]] : !fir.ref -// CHECK: %[[NEW_MIN:.*]] = arith.cmpf ogt, %[[INARR_ITEMVAL]], %[[MIN]] fastmath<{{.*}}> : f64 -// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[NEW_MIN]] -> (f64) { +// CHECK: %[[NEW_MIN:.*]] = arith.cmpf ogt, %[[INARR_ITEMVAL]], %arg4 fastmath : f64 +// CHECK: %[[CONDRED:.*]] = arith.cmpf une, %arg4, %arg4 fastmath : f64 +// CHECK: %[[CONDELEM:.*]] = arith.cmpf oeq, %[[INARR_ITEMVAL]], %[[INARR_ITEMVAL]] fastmath : f64 +// CHECK: %[[ANDCOND:.*]] = arith.andi %[[CONDRED]], %[[CONDELEM]] : i1 +// CHECK: %[[NEW_MIN2:.*]] = arith.ori %[[NEW_MIN]], %[[ANDCOND]] : i1 +// CHECK: %[[ISFIRSTL:.*]] = fir.convert %[[ISFIRST]] : (i32) -> i1 +// CHECK: %[[ISFIRSTNOT:.*]] = arith.xori %[[ISFIRSTL]], %true : i1 +// CHECK: %[[ORCOND:.*]] = arith.ori %[[NEW_MIN2]], %[[ISFIRSTNOT]] : i1 +// CHECK: %[[IF_NEW_MIN:.*]] = fir.if %[[ORCOND]] -> (f64) { // CHECK: %[[ONE:.*]] = arith.constant 1 : i32 // CHECK: %[[OUTARR_IDX:.*]] = arith.constant 0 : index // CHECK: %[[OUTARR_ITEM:.*]] = fir.coordinate_of %[[BOX_OUTARR]], %[[OUTARR_IDX]] : (!fir.box>>, index) -> !fir.ref @@ -2580,18 +2564,6 @@ func.func @_QPtestmaxloc_works1d_scalarmask_f64(%arg0: !fir.ref -// CHECK: %[[FLAG_WAS_SET:.*]] = arith.cmpi eq, %[[FLAG_VAL]], %[[FLAG_CHECK]] : i32 -// CHECK: fir.if %[[FLAG_WAS_SET]] { -// CHECK: %[[TEST_MAX:.*]] = arith.constant -1.7976931348623157E+308 : f64 -// CHECK: %[[INIT_NOT_CHANGED:.*]] = arith.cmpf oeq, %[[TEST_MAX]], %[[INIT_RES:.*]] fastmath<{{.*}}> : f64 -// CHECK: fir.if %[[INIT_NOT_CHANGED]] { -// CHECK: %[[FLAG_OUTARR_IDX:.*]] = arith.constant 0 : index -// CHECK: %[[FLAG_OUTARR_ITEM:.*]] = fir.coordinate_of 
%[[BOX_OUTARR]], %[[FLAG_OUTARR_IDX]] : (!fir.box>>, index) -> !fir.ref -// CHECK: fir.store %[[FLAG_CHECK]] to %[[FLAG_OUTARR_ITEM]] : !fir.ref -// CHECK: } -// CHECK: } // CHECK: %[[REF_BOX_OUTARR:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref>>> // CHECK: fir.store %[[BOX_OUTARR]] to %[[REF_BOX_OUTARR]] : !fir.ref>>> // CHECK: return From 5db49f726619b943d8201ef3867393923836cb2f Mon Sep 17 00:00:00 2001 From: Nick Anderson Date: Wed, 21 Feb 2024 01:41:59 -0800 Subject: [PATCH 071/351] [GlobalISel] replace right identity X * -1.0 with fneg(x) (#80526) follow up patch to #78673 @Pierre-vh @jayfoad @arsenm Could you review when you have a chance. --- .../include/llvm/Target/GlobalISel/Combine.td | 10 +- .../GlobalISel/combine-fpneg-one-fneg.mir | 216 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 160 ++----------- llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 20 +- 4 files changed, 253 insertions(+), 153 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fpneg-one-fneg.mir diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 7eadb718f1641..17757ca3e4111 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -493,6 +493,13 @@ def right_identity_one_fp: GICombineRule< (apply (GIReplaceReg $dst, $x)) >; +def right_identity_neg_one_fp: GICombineRule< + (defs root:$dst), + (match (G_FMUL $dst, $x, $y):$root, + [{ return Helper.matchConstantFPOp(${y}, -1.0); }]), + (apply (G_FNEG $dst, $x)) +>; + def right_identity_one : GICombineGroup<[right_identity_one_int, right_identity_one_fp]>; // Fold (x op x) - > x @@ -1283,7 +1290,8 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero, trunc_buildvector_fold, trunc_lshr_buildvector_fold, bitcast_bitcast_fold, fptrunc_fpext_fold, - right_identity_neg_zero_fp]>; + right_identity_neg_zero_fp, + right_identity_neg_one_fp]>; def const_combines 
: GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p, overlapping_and, mulo_by_2, mulo_by_0, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fpneg-one-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fpneg-one-fneg.mir new file mode 100644 index 0000000000000..8ec2778992e23 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fpneg-one-fneg.mir @@ -0,0 +1,216 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK + +--- +name: test_neg_one_f16_sgpr +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: test_neg_one_f16_sgpr + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: %d:_(s16) = G_FNEG %x + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %d(s16) + ; CHECK-NEXT: $sgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $sgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %y:_(s16) = G_FCONSTANT half -1.0 + %d:_(s16) = G_FMUL %x, %y + %ext:_(s32) = G_ANYEXT %d:_(s16) + $sgpr0 = COPY %ext + +... + +--- +name: test_neg_one_f32_sgpr +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: test_neg_one_f32_sgpr + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]] + ; CHECK-NEXT: $sgpr0 = COPY [[FNEG]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_FCONSTANT float -1.0 + %2:_(s32) = G_FMUL %0, %1 + $sgpr0 = COPY %2(s32) + +... 
+ +--- +name: test_neg_one_f64_sgpr +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: test_neg_one_f64_sgpr + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; CHECK-NEXT: %d:_(s64) = G_FNEG %x + ; CHECK-NEXT: %ext:_(s32) = G_TRUNC %d(s64) + ; CHECK-NEXT: $sgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $sgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %y:_(s64) = G_FCONSTANT double -1.0 + %d:_(s64) = G_FMUL %x, %y + %ext:_(s32) = G_TRUNC %d:_(s64) + $sgpr0 = COPY %ext + +... + +--- +name: test_neg_ten_f32_sgpr +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: test_neg_ten_f32_sgpr + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.000000e+01 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[C]] + ; CHECK-NEXT: $sgpr0 = COPY [[FMUL]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_FCONSTANT float -10.0 + %2:_(s32) = G_FMUL %0, %1 + $sgpr0 = COPY %2(s32) + +... + +--- +name: test_neg_fract_f32_sgpr +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: test_neg_fract_f32_sgpr + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -5.000000e-01 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[C]] + ; CHECK-NEXT: $sgpr0 = COPY [[FMUL]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_FCONSTANT float -0.5 + %2:_(s32) = G_FMUL %0, %1 + $sgpr0 = COPY %2(s32) + +... 
+ +--- +name: test_neg_one_f16_vgpr +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_neg_one_f16_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: %d:_(s16) = G_FNEG %x + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %d(s16) + ; CHECK-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %y:_(s16) = G_FCONSTANT half -1.0 + %d:_(s16) = G_FMUL %x, %y + %ext:_(s32) = G_ANYEXT %d:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: test_neg_one_f32_vgpr +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_neg_one_f32_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]] + ; CHECK-NEXT: $vgpr0 = COPY [[FNEG]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_FCONSTANT float -1.0 + %2:_(s32) = G_FMUL %0, %1 + $vgpr0 = COPY %2(s32) + +... + +--- +name: test_neg_one_f64_vgpr +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_neg_one_f64_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; CHECK-NEXT: %d:_(s64) = G_FNEG %x + ; CHECK-NEXT: %ext:_(s32) = G_TRUNC %d(s64) + ; CHECK-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %y:_(s64) = G_FCONSTANT double -1.0 + %d:_(s64) = G_FMUL %x, %y + %ext:_(s32) = G_TRUNC %d:_(s64) + $vgpr0 = COPY %ext + +... 
+ +--- +name: test_neg_ten_f32_vgpr +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_neg_ten_f32_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.000000e+01 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[C]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_FCONSTANT float -10.0 + %2:_(s32) = G_FMUL %0, %1 + $vgpr0 = COPY %2(s32) + +... + +--- +name: test_neg_fract_f32_vgpr +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_neg_fract_f32_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -5.000000e-01 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[C]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_FCONSTANT float -0.5 + %2:_(s32) = G_FMUL %0, %1 + $vgpr0 = COPY %2(s32) + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 3e658c6f38532..711a5fff1a063 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -218,41 +218,11 @@ define float @v_mul_neg2_f32(float %x) { } define float @v_mul_neg1_f32(float %x) { -; GFX9-SDAG-LABEL: v_mul_neg1_f32: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_neg1_f32: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_neg1_f32: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_neg1_f32: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_neg1_f32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_mul_neg1_f32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_mul_neg1_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %mul = fmul float %x, -1.0 ret float %mul } @@ -1356,41 +1326,11 @@ define double @v_mul_0_f64(double %x) { } define double @v_mul_neg1_f64(double %x) { -; GFX9-SDAG-LABEL: 
v_mul_neg1_f64: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_neg1_f64: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], -1.0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_neg1_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_neg1_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], -1.0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_neg1_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_mul_neg1_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], -1.0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_mul_neg1_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] %mul = fmul double %x, -1.0 ret double %mul } @@ -2848,41 +2788,11 @@ define half @v_mul_neg2_f16(half %x) { } define half @v_mul_neg1_f16(half %x) { -; GFX9-SDAG-LABEL: v_mul_neg1_f16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_neg1_f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, -1.0, v0 -; 
GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_neg1_f16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_neg1_f16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, -1.0, v0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_neg1_f16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_mul_neg1_f16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, -1.0, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_mul_neg1_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %mul = fmul half %x, -1.0 ret half %mul } @@ -7112,41 +7022,11 @@ define double @v_mul_fabs_neg2_f64(double %x) { } define double @v_mul_fabs_neg1_f64(double %x) { -; GFX9-SDAG-LABEL: v_mul_fabs_neg1_f64: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_or_b32_e32 v1, 0x80000000, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_fabs_neg1_f64: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, -1.0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_fabs_neg1_f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_or_b32_e32 v1, 0x80000000, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_fabs_neg1_f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, -1.0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_fabs_neg1_f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_or_b32_e32 v1, 0x80000000, v1 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_mul_fabs_neg1_f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, -1.0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_mul_fabs_neg1_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] %fabs.x = call double @llvm.fabs.f64(double %x) %mul = fmul double %fabs.x, -1.0 ret double %mul diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 90175298a99ac..bd6e1f54e636d 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -3431,9 +3431,8 @@ define double @v_neg_rsq_f64__afn(double %x) { ; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 ; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_neg_rsq_f64__afn: @@ -3503,9 +3502,8 @@ define double @v_neg_rsq_f64__afn(double %x) { ; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 ; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 -; VI-GISEL-NEXT: v_fma_f64 
v[0:1], -v[0:1], v[4:5], -1.0 -; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3] ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn double -1.0, %sqrt @@ -4015,9 +4013,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { ; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 ; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf: @@ -4087,9 +4084,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { ; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] ; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 ; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 -; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 -; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[2:3] ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x) %rsq = fdiv contract afn nnan ninf double -1.0, %sqrt From 91f11611337dde9a8e0a5e19240f6bb4671922c6 Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Wed, 21 Feb 2024 11:01:00 +0100 Subject: [PATCH 072/351] [mlir] expose transform interpreter to Python (#82365) Transform interpreter functionality can be used standalone without going through the interpreter pass, make it 
available in Python. --- .../mlir-c/Dialect/Transform/Interpreter.h | 77 ++++++++++++++++ .../mlir/Bindings/Python/PybindAdaptors.h | 36 ++++++++ mlir/lib/Bindings/Python/DialectLLVM.cpp | 31 ------- mlir/lib/Bindings/Python/IRCore.cpp | 7 ++ mlir/lib/Bindings/Python/IRModule.h | 1 + .../Bindings/Python/TransformInterpreter.cpp | 90 +++++++++++++++++++ mlir/lib/CAPI/Dialect/CMakeLists.txt | 9 ++ .../lib/CAPI/Dialect/TransformInterpreter.cpp | 74 +++++++++++++++ mlir/python/CMakeLists.txt | 19 ++++ .../transform/interpreter/__init__.py | 33 +++++++ mlir/test/CAPI/CMakeLists.txt | 9 ++ mlir/test/CAPI/transform_interpreter.c | 69 ++++++++++++++ mlir/test/CMakeLists.txt | 1 + mlir/test/lit.cfg.py | 1 + .../python/dialects/transform_interpreter.py | 56 ++++++++++++ .../llvm-project-overlay/mlir/BUILD.bazel | 19 ++++ .../mlir/python/BUILD.bazel | 5 ++ 17 files changed, 506 insertions(+), 31 deletions(-) create mode 100644 mlir/include/mlir-c/Dialect/Transform/Interpreter.h create mode 100644 mlir/lib/Bindings/Python/TransformInterpreter.cpp create mode 100644 mlir/lib/CAPI/Dialect/TransformInterpreter.cpp create mode 100644 mlir/python/mlir/dialects/transform/interpreter/__init__.py create mode 100644 mlir/test/CAPI/transform_interpreter.c create mode 100644 mlir/test/python/dialects/transform_interpreter.py diff --git a/mlir/include/mlir-c/Dialect/Transform/Interpreter.h b/mlir/include/mlir-c/Dialect/Transform/Interpreter.h new file mode 100644 index 0000000000000..00095d5040a0e --- /dev/null +++ b/mlir/include/mlir-c/Dialect/Transform/Interpreter.h @@ -0,0 +1,77 @@ +//===-- mlir-c/Dialect/Transform/Interpreter.h --------------------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// C interface to the transform dialect interpreter. +// +//===----------------------------------------------------------------------===// + +#include "mlir-c/IR.h" +#include "mlir-c/Support.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEFINE_C_API_STRUCT(name, storage) \ + struct name { \ + storage *ptr; \ + }; \ + typedef struct name name + +DEFINE_C_API_STRUCT(MlirTransformOptions, void); + +#undef DEFINE_C_API_STRUCT + +//----------------------------------------------------------------------------// +// MlirTransformOptions +//----------------------------------------------------------------------------// + +/// Creates a default-initialized transform options object. +MLIR_CAPI_EXPORTED MlirTransformOptions mlirTransformOptionsCreate(void); + +/// Enables or disables expensive checks in transform options. +MLIR_CAPI_EXPORTED void +mlirTransformOptionsEnableExpensiveChecks(MlirTransformOptions transformOptions, + bool enable); + +/// Returns true if expensive checks are enabled in transform options. +MLIR_CAPI_EXPORTED bool mlirTransformOptionsGetExpensiveChecksEnabled( + MlirTransformOptions transformOptions); + +/// Enables or disables the enforcement of the top-level transform op being +/// single in transform options. +MLIR_CAPI_EXPORTED void mlirTransformOptionsEnforceSingleTopLevelTransformOp( + MlirTransformOptions transformOptions, bool enable); + +/// Returns true if the enforcement of the top-level transform op being single +/// is enabled in transform options. +MLIR_CAPI_EXPORTED bool mlirTransformOptionsGetEnforceSingleTopLevelTransformOp( + MlirTransformOptions transformOptions); + +/// Destroys a transform options object previously created by +/// mlirTransformOptionsCreate. 
+MLIR_CAPI_EXPORTED void +mlirTransformOptionsDestroy(MlirTransformOptions transformOptions); + +//----------------------------------------------------------------------------// +// Transform interpreter. +//----------------------------------------------------------------------------// + +/// Applies the transformation script starting at the given transform root +/// operation to the given payload operation. The module containing the +/// transform root as well as the transform options should be provided. The +/// transform operation must implement TransformOpInterface and the module must +/// be a ModuleOp. Returns the status of the application. +MLIR_CAPI_EXPORTED MlirLogicalResult mlirTransformApplyNamedSequence( + MlirOperation payload, MlirOperation transformRoot, + MlirOperation transformModule, MlirTransformOptions transformOptions); + +#ifdef __cplusplus +} +#endif diff --git a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h index 66cf20e1c136f..52f6321251919 100644 --- a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h @@ -23,6 +23,7 @@ #include #include "mlir-c/Bindings/Python/Interop.h" +#include "mlir-c/Diagnostics.h" #include "mlir-c/IR.h" #include "llvm/ADT/Twine.h" @@ -569,6 +570,41 @@ class mlir_value_subclass : public pure_subclass { }; } // namespace adaptors + +/// RAII scope intercepting all diagnostics into a string. The message must be +/// checked before this goes out of scope. 
+class CollectDiagnosticsToStringScope { +public: + explicit CollectDiagnosticsToStringScope(MlirContext ctx) : context(ctx) { + handlerID = mlirContextAttachDiagnosticHandler(ctx, &handler, &errorMessage, + /*deleteUserData=*/nullptr); + } + ~CollectDiagnosticsToStringScope() { + assert(errorMessage.empty() && "unchecked error message"); + mlirContextDetachDiagnosticHandler(context, handlerID); + } + + [[nodiscard]] std::string takeMessage() { return std::move(errorMessage); } + +private: + static MlirLogicalResult handler(MlirDiagnostic diag, void *data) { + auto printer = +[](MlirStringRef message, void *data) { + *static_cast(data) += + llvm::StringRef(message.data, message.length); + }; + MlirLocation loc = mlirDiagnosticGetLocation(diag); + *static_cast(data) += "at "; + mlirLocationPrint(loc, printer, data); + *static_cast(data) += ": "; + mlirDiagnosticPrint(diag, printer, data); + return mlirLogicalResultSuccess(); + } + + MlirContext context; + MlirDiagnosticHandlerID handlerID; + std::string errorMessage = ""; +}; + } // namespace python } // namespace mlir diff --git a/mlir/lib/Bindings/Python/DialectLLVM.cpp b/mlir/lib/Bindings/Python/DialectLLVM.cpp index 780f5eacf0b8e..843707751dd84 100644 --- a/mlir/lib/Bindings/Python/DialectLLVM.cpp +++ b/mlir/lib/Bindings/Python/DialectLLVM.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir-c/Diagnostics.h" #include "mlir-c/Dialect/LLVM.h" #include "mlir-c/IR.h" #include "mlir-c/Support.h" @@ -19,36 +18,6 @@ using namespace mlir; using namespace mlir::python; using namespace mlir::python::adaptors; -/// RAII scope intercepting all diagnostics into a string. The message must be -/// checked before this goes out of scope. 
-class CollectDiagnosticsToStringScope { -public: - explicit CollectDiagnosticsToStringScope(MlirContext ctx) : context(ctx) { - handlerID = mlirContextAttachDiagnosticHandler(ctx, &handler, &errorMessage, - /*deleteUserData=*/nullptr); - } - ~CollectDiagnosticsToStringScope() { - assert(errorMessage.empty() && "unchecked error message"); - mlirContextDetachDiagnosticHandler(context, handlerID); - } - - [[nodiscard]] std::string takeMessage() { return std::move(errorMessage); } - -private: - static MlirLogicalResult handler(MlirDiagnostic diag, void *data) { - auto printer = +[](MlirStringRef message, void *data) { - *static_cast(data) += - StringRef(message.data, message.length); - }; - mlirDiagnosticPrint(diag, printer, data); - return mlirLogicalResultSuccess(); - } - - MlirContext context; - MlirDiagnosticHandlerID handlerID; - std::string errorMessage = ""; -}; - void populateDialectLLVMSubmodule(const pybind11::module &m) { auto llvmStructType = mlir_type_subclass(m, "StructType", mlirTypeIsALLVMStructType); diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 8a7951dc29fe5..734f2f7f3f94c 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -678,6 +678,10 @@ void PyMlirContext::clearOperationsInside(PyOperationBase &op) { mlirOperationWalk(op.getOperation(), invalidatingCallback, static_cast(&data), MlirWalkPreOrder); } +void PyMlirContext::clearOperationsInside(MlirOperation op) { + PyOperationRef opRef = PyOperation::forOperation(getRef(), op); + clearOperationsInside(opRef->getOperation()); +} size_t PyMlirContext::getLiveModuleCount() { return liveModules.size(); } @@ -2556,6 +2560,9 @@ void mlir::python::populateIRCore(py::module &m) { .def("_get_live_operation_objects", &PyMlirContext::getLiveOperationObjects) .def("_clear_live_operations", &PyMlirContext::clearLiveOperations) + .def("_clear_live_operations_inside", + py::overload_cast( + 
&PyMlirContext::clearOperationsInside)) .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount) .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, &PyMlirContext::getCapsule) diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 48f39c939340d..9acfdde25ae04 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -223,6 +223,7 @@ class PyMlirContext { /// Clears all operations nested inside the given op using /// `clearOperation(MlirOperation)`. void clearOperationsInside(PyOperationBase &op); + void clearOperationsInside(MlirOperation op); /// Gets the count of live modules associated with this context. /// Used for testing. diff --git a/mlir/lib/Bindings/Python/TransformInterpreter.cpp b/mlir/lib/Bindings/Python/TransformInterpreter.cpp new file mode 100644 index 0000000000000..6517f8c39dfad --- /dev/null +++ b/mlir/lib/Bindings/Python/TransformInterpreter.cpp @@ -0,0 +1,90 @@ +//===- TransformInterpreter.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Pybind classes for the transform dialect interpreter. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir-c/Dialect/Transform/Interpreter.h" +#include "mlir-c/IR.h" +#include "mlir-c/Support.h" +#include "mlir/Bindings/Python/PybindAdaptors.h" + +#include +#include + +namespace py = pybind11; + +namespace { +struct PyMlirTransformOptions { + PyMlirTransformOptions() { options = mlirTransformOptionsCreate(); }; + PyMlirTransformOptions(PyMlirTransformOptions &&other) { + options = other.options; + other.options.ptr = nullptr; + } + PyMlirTransformOptions(const PyMlirTransformOptions &) = delete; + + ~PyMlirTransformOptions() { mlirTransformOptionsDestroy(options); } + + MlirTransformOptions options; +}; +} // namespace + +static void populateTransformInterpreterSubmodule(py::module &m) { + py::class_(m, "TransformOptions", py::module_local()) + .def(py::init()) + .def_property( + "expensive_checks", + [](const PyMlirTransformOptions &self) { + return mlirTransformOptionsGetExpensiveChecksEnabled(self.options); + }, + [](PyMlirTransformOptions &self, bool value) { + mlirTransformOptionsEnableExpensiveChecks(self.options, value); + }) + .def_property( + "enforce_single_top_level_transform_op", + [](const PyMlirTransformOptions &self) { + return mlirTransformOptionsGetEnforceSingleTopLevelTransformOp( + self.options); + }, + [](PyMlirTransformOptions &self, bool value) { + mlirTransformOptionsEnforceSingleTopLevelTransformOp(self.options, + value); + }); + + m.def( + "apply_named_sequence", + [](MlirOperation payloadRoot, MlirOperation transformRoot, + MlirOperation transformModule, const PyMlirTransformOptions &options) { + mlir::python::CollectDiagnosticsToStringScope scope( + mlirOperationGetContext(transformRoot)); + + // Calling back into Python to invalidate everything under the payload + // root. This is awkward, but we don't have access to PyMlirContext + // object here otherwise. 
+ py::object obj = py::cast(payloadRoot); + obj.attr("context").attr("_clear_live_operations_inside")(payloadRoot); + + MlirLogicalResult result = mlirTransformApplyNamedSequence( + payloadRoot, transformRoot, transformModule, options.options); + if (mlirLogicalResultIsSuccess(result)) + return; + + throw py::value_error( + "Failed to apply named transform sequence.\nDiagnostic message " + + scope.takeMessage()); + }, + py::arg("payload_root"), py::arg("transform_root"), + py::arg("transform_module"), + py::arg("transform_options") = PyMlirTransformOptions()); +} + +PYBIND11_MODULE(_mlirTransformInterpreter, m) { + m.doc() = "MLIR Transform dialect interpreter functionality."; + populateTransformInterpreterSubmodule(m); +} diff --git a/mlir/lib/CAPI/Dialect/CMakeLists.txt b/mlir/lib/CAPI/Dialect/CMakeLists.txt index b2952da17a41c..58b8739043f9d 100644 --- a/mlir/lib/CAPI/Dialect/CMakeLists.txt +++ b/mlir/lib/CAPI/Dialect/CMakeLists.txt @@ -198,6 +198,15 @@ add_mlir_upstream_c_api_library(MLIRCAPITransformDialect MLIRTransformDialect ) +add_mlir_upstream_c_api_library(MLIRCAPITransformDialectTransforms + TransformInterpreter.cpp + + PARTIAL_SOURCES_INTENDED + LINK_LIBS PUBLIC + MLIRCAPIIR + MLIRTransformDialectTransforms +) + add_mlir_upstream_c_api_library(MLIRCAPIQuant Quant.cpp diff --git a/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp b/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp new file mode 100644 index 0000000000000..6a2cfb235fcfd --- /dev/null +++ b/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp @@ -0,0 +1,74 @@ +//===- TransformTransforms.cpp - C Interface for Transform dialect --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// C interface to transforms for the transform dialect. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir-c/Dialect/Transform/Interpreter.h" +#include "mlir-c/Support.h" +#include "mlir/CAPI/IR.h" +#include "mlir/CAPI/Support.h" +#include "mlir/CAPI/Wrap.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" + +using namespace mlir; + +DEFINE_C_API_PTR_METHODS(MlirTransformOptions, transform::TransformOptions) + +extern "C" { + +MlirTransformOptions mlirTransformOptionsCreate() { + return wrap(new transform::TransformOptions); +} + +void mlirTransformOptionsEnableExpensiveChecks( + MlirTransformOptions transformOptions, bool enable) { + unwrap(transformOptions)->enableExpensiveChecks(enable); +} + +bool mlirTransformOptionsGetExpensiveChecksEnabled( + MlirTransformOptions transformOptions) { + return unwrap(transformOptions)->getExpensiveChecksEnabled(); +} + +void mlirTransformOptionsEnforceSingleTopLevelTransformOp( + MlirTransformOptions transformOptions, bool enable) { + unwrap(transformOptions)->enableEnforceSingleToplevelTransformOp(enable); +} + +bool mlirTransformOptionsGetEnforceSingleTopLevelTransformOp( + MlirTransformOptions transformOptions) { + return unwrap(transformOptions)->getEnforceSingleToplevelTransformOp(); +} + +void mlirTransformOptionsDestroy(MlirTransformOptions transformOptions) { + delete unwrap(transformOptions); +} + +MlirLogicalResult mlirTransformApplyNamedSequence( + MlirOperation payload, MlirOperation transformRoot, + MlirOperation transformModule, MlirTransformOptions transformOptions) { + Operation *transformRootOp = unwrap(transformRoot); + Operation *transformModuleOp = unwrap(transformModule); + if (!isa(transformRootOp)) { + transformRootOp->emitError() + << "must implement TransformOpInterface to be used as transform root"; + return mlirLogicalResultFailure(); + } + if (!isa(transformModuleOp)) { + transformModuleOp->emitError() + << "must be a 
" << ModuleOp::getOperationName(); + return mlirLogicalResultFailure(); + } + return wrap(transform::applyTransformNamedSequence( + unwrap(payload), unwrap(transformRoot), + cast(unwrap(transformModule)), *unwrap(transformOptions))); +} +} diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index ed167afeb69a6..563d035f15526 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -181,6 +181,13 @@ declare_mlir_python_sources( SOURCES dialects/transform/extras/__init__.py) +declare_mlir_python_sources( + MLIRPythonSources.Dialects.transform.interpreter + ADD_TO_PARENT MLIRPythonSources.Dialects + ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" + SOURCES + dialects/transform/interpreter/__init__.py) + declare_mlir_dialect_extension_python_bindings( ADD_TO_PARENT MLIRPythonSources.Dialects ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" @@ -609,6 +616,18 @@ declare_mlir_python_extension(MLIRPythonExtension.SparseTensorDialectPasses MLIRCAPISparseTensor ) +declare_mlir_python_extension(MLIRPythonExtension.TransformInterpreter + MODULE_NAME _mlirTransformInterpreter + ADD_TO_PARENT MLIRPythonSources.Dialects.transform + ROOT_DIR "${PYTHON_SOURCE_DIR}" + SOURCES + TransformInterpreter.cpp + PRIVATE_LINK_LIBS + LLVMSupport + EMBED_CAPI_LINK_LIBS + MLIRCAPITransformDialectTransforms +) + # TODO: Figure out how to put this in the test tree. # This should not be included in the main Python extension. However, # putting it into MLIRPythonTestSources along with the dialect declaration diff --git a/mlir/python/mlir/dialects/transform/interpreter/__init__.py b/mlir/python/mlir/dialects/transform/interpreter/__init__.py new file mode 100644 index 0000000000000..6145b99224eb5 --- /dev/null +++ b/mlir/python/mlir/dialects/transform/interpreter/__init__.py @@ -0,0 +1,33 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from ....ir import Operation +from ...._mlir_libs import _mlirTransformInterpreter as _cextTransformInterpreter + + +TransformOptions = _cextTransformInterpreter.TransformOptions + + +def _unpack_operation(op): + if isinstance(op, Operation): + return op + return op.operation + + +def apply_named_sequence( + payload_root, transform_root, transform_module, transform_options=None +): + """Applies the transformation script starting at the given transform root + operation to the given payload operation. The module containing the + transform root as well as the transform options should be provided. + The transform operation must implement TransformOpInterface and the module + must be a ModuleOp.""" + + args = tuple( + map(_unpack_operation, (payload_root, transform_root, transform_module)) + ) + if transform_options is None: + _cextTransformInterpreter.apply_named_sequence(*args) + else: + _cextTransformInterpreter(*args, transform_options) diff --git a/mlir/test/CAPI/CMakeLists.txt b/mlir/test/CAPI/CMakeLists.txt index 1096a3b080664..79b61fdef38b4 100644 --- a/mlir/test/CAPI/CMakeLists.txt +++ b/mlir/test/CAPI/CMakeLists.txt @@ -86,6 +86,15 @@ _add_capi_test_executable(mlir-capi-transform-test MLIRCAPITransformDialect ) +_add_capi_test_executable(mlir-capi-transform-interpreter-test + transform_interpreter.c + LINK_LIBS PRIVATE + MLIRCAPIIR + MLIRCAPIRegisterEverything + MLIRCAPITransformDialect + MLIRCAPITransformDialectTransforms +) + _add_capi_test_executable(mlir-capi-translation-test translation.c LINK_LIBS PRIVATE diff --git a/mlir/test/CAPI/transform_interpreter.c b/mlir/test/CAPI/transform_interpreter.c new file mode 100644 index 0000000000000..8fe37b47b7f87 --- /dev/null +++ b/mlir/test/CAPI/transform_interpreter.c @@ -0,0 +1,69 @@ +//===- transform_interpreter.c - Test of the Transform interpreter C API --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: mlir-capi-transform-interpreter-test 2>&1 | FileCheck %s + +#include "mlir-c/Dialect/Transform.h" +#include "mlir-c/Dialect/Transform/Interpreter.h" +#include "mlir-c/IR.h" +#include "mlir-c/Support.h" + +#include +#include + +int testApplyNamedSequence(MlirContext ctx) { + fprintf(stderr, "%s\n", __FUNCTION__); + + const char module[] = + "module attributes {transform.with_named_sequence} {" + " transform.named_sequence @__transform_main(%root: !transform.any_op) {" + " transform.print %root { name = \"from interpreter\" }: " + "!transform.any_op" + " transform.yield" + " }" + "}"; + + MlirStringRef moduleStringRef = mlirStringRefCreateFromCString(module); + MlirStringRef nameStringRef = mlirStringRefCreateFromCString("inline-module"); + + MlirOperation root = + mlirOperationCreateParse(ctx, moduleStringRef, nameStringRef); + if (mlirOperationIsNull(root)) + return 1; + MlirBlock body = mlirRegionGetFirstBlock(mlirOperationGetRegion(root, 0)); + MlirOperation entry = mlirBlockGetFirstOperation(body); + + MlirTransformOptions options = mlirTransformOptionsCreate(); + mlirTransformOptionsEnableExpensiveChecks(options, true); + mlirTransformOptionsEnforceSingleTopLevelTransformOp(options, true); + + MlirLogicalResult result = + mlirTransformApplyNamedSequence(root, entry, root, options); + mlirTransformOptionsDestroy(options); + if (mlirLogicalResultIsFailure(result)) + return 2; + + return 0; +} +// CHECK-LABEL: testApplyNamedSequence +// CHECK: from interpreter +// CHECK: transform.named_sequence @__transform_main +// CHECK: transform.print %arg0 +// CHECK: transform.yield + +int main(void) { + MlirContext ctx = mlirContextCreate(); + mlirDialectHandleRegisterDialect(mlirGetDialectHandle__transform__(), ctx); + int result = 
testApplyNamedSequence(ctx); + mlirContextDestroy(ctx); + if (result) + return result; + + return EXIT_SUCCESS; +} diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 6724dd4bdd1bc..74921544c5557 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -100,6 +100,7 @@ set(MLIR_TEST_DEPENDS mlir-capi-quant-test mlir-capi-sparse-tensor-test mlir-capi-transform-test + mlir-capi-transform-interpreter-test mlir-capi-translation-test mlir-linalg-ods-yaml-gen mlir-lsp-server diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 38e65e4549c55..904dfb680a040 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -106,6 +106,7 @@ def add_runtime(name): "mlir-capi-quant-test", "mlir-capi-sparse-tensor-test", "mlir-capi-transform-test", + "mlir-capi-transform-interpreter-test", "mlir-capi-translation-test", "mlir-cpu-runner", add_runtime("mlir_runner_utils"), diff --git a/mlir/test/python/dialects/transform_interpreter.py b/mlir/test/python/dialects/transform_interpreter.py new file mode 100644 index 0000000000000..740c49f76a26c --- /dev/null +++ b/mlir/test/python/dialects/transform_interpreter.py @@ -0,0 +1,56 @@ +# RUN: %PYTHON %s | FileCheck %s + +from mlir import ir +from mlir.dialects.transform import interpreter as interp + + +def test_in_context(f): + with ir.Context(), ir.Location.unknown(): + f() + return f + + +print_root_module = """ +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.print %root { name = \"from interpreter\" }: !transform.any_op + transform.yield + } +}""" + + +@test_in_context +def print_self(): + m = ir.Module.parse(print_root_module.replace("from interpreter", "print_self")) + interp.apply_named_sequence(m, m.body.operations[0], m) + + +# CHECK-LABEL: print_self +# CHECK: transform.named_sequence @__transform_main +# CHECK: transform.print +# CHECK: transform.yield + + +@test_in_context +def print_other(): + 
transform = ir.Module.parse( + print_root_module.replace("from interpreter", "print_other") + ) + payload = ir.Module.parse("module attributes { this.is.payload } {}") + interp.apply_named_sequence(payload, transform.body.operations[0], transform) + + +# CHECK-LABEL: print_other +# CHECK-NOT: transform +# CHECK: this.is.payload + + +@test_in_context +def failed(): + payload = ir.Module.parse("module attributes { this.is.payload } {}") + try: + interp.apply_named_sequence(payload, payload, payload) + except ValueError as e: + assert ( + "must implement TransformOpInterface to be used as transform root" in str(e) + ) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index a21bc01aa1e3c..bb7a34ef76772 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -739,6 +739,25 @@ mlir_c_api_cc_library( ], ) +mlir_c_api_cc_library( + name = "CAPITransformDialectTransforms", + srcs = [ + "lib/CAPI/Dialect/TransformInterpreter.cpp", + ], + hdrs = [ + "include/mlir-c/Dialect/Transform/Interpreter.h", + ], + capi_deps = [ + ":CAPIIR", + ":CAPITransformDialect", + ], + includes = ["include"], + deps = [ + ":TransformDialect", + ":TransformDialectTransforms", + ], +) + mlir_c_api_cc_library( name = "CAPIMLProgram", srcs = [ diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel index f19c2336e6bcb..0c3ed22e73601 100644 --- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel @@ -1483,6 +1483,11 @@ filegroup( srcs = glob(["mlir/dialects/transform/extras/*.py"]), ) +filegroup( + name = "TransformInterpreterPackagePyFiles", + srcs = glob(["mlir/dialects/transform/interpreter/*.py"]), +) + ##---------------------------------------------------------------------------## # Vector dialect. 
##---------------------------------------------------------------------------## From 48101edc8d57364d9c9f9e2829f0d4e975c0ade5 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Wed, 21 Feb 2024 10:05:50 +0000 Subject: [PATCH 073/351] [AArch64] Fix syntax of gcsstr and gcssttr instructions (#82385) The address register should be surrounded by square brackets, like in all the other str instructions. Fixes https://github.com/llvm/llvm-project/issues/81846 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 +- llvm/test/MC/AArch64/armv9.4a-gcs.s | 16 ++++++++-------- .../MC/Disassembler/AArch64/armv9.4a-gcs.txt | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 8c2a852850320..8e73f57ced42b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1243,7 +1243,7 @@ def : InstAlias<"chkfeat\tx16", (CHKFEAT), 0>; def : InstAlias<"chkfeat\tx16", (CHKFEAT), 1>, Requires<[HasCHK]>; class GCSSt op> - : I<(outs), (ins GPR64:$Rt, GPR64sp:$Rn), mnemonic, "\t$Rt, $Rn", "", []>, Sched<[]> { + : I<(outs), (ins GPR64:$Rt, GPR64sp:$Rn), mnemonic, "\t$Rt, [$Rn]", "", []>, Sched<[]> { bits<5> Rt; bits<5> Rn; let Inst{31-15} = 0b11011001000111110; diff --git a/llvm/test/MC/AArch64/armv9.4a-gcs.s b/llvm/test/MC/AArch64/armv9.4a-gcs.s index 8910229b8dace..b4af9b5dcb10c 100644 --- a/llvm/test/MC/AArch64/armv9.4a-gcs.s +++ b/llvm/test/MC/AArch64/armv9.4a-gcs.s @@ -86,20 +86,20 @@ hint #19 // ERROR-NO-GCS-NOT: [[@LINE-2]]:1: error: instruction requires: gcs // NO-GCS: hint #19 // encoding: [0x7f,0x22,0x03,0xd5] -gcsstr x26, x27 -// CHECK: gcsstr x26, x27 // encoding: [0x7a,0x0f,0x1f,0xd9] +gcsstr x26, [x27] +// CHECK: gcsstr x26, [x27] // encoding: [0x7a,0x0f,0x1f,0xd9] // ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs -gcsstr x26, sp -// CHECK: gcsstr x26, sp // encoding: [0xfa,0x0f,0x1f,0xd9] +gcsstr x26, 
[sp] +// CHECK: gcsstr x26, [sp] // encoding: [0xfa,0x0f,0x1f,0xd9] // ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs -gcssttr x26, x27 -// CHECK: gcssttr x26, x27 // encoding: [0x7a,0x1f,0x1f,0xd9] +gcssttr x26, [x27] +// CHECK: gcssttr x26, [x27] // encoding: [0x7a,0x1f,0x1f,0xd9] // ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs -gcssttr x26, sp -// CHECK: gcssttr x26, sp // encoding: [0xfa,0x1f,0x1f,0xd9] +gcssttr x26, [sp] +// CHECK: gcssttr x26, [sp] // encoding: [0xfa,0x1f,0x1f,0xd9] // ERROR-NO-GCS: [[@LINE-2]]:1: error: instruction requires: gcs gcspushx diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.4a-gcs.txt b/llvm/test/MC/Disassembler/AArch64/armv9.4a-gcs.txt index 7e2802b263858..512f4027d9761 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv9.4a-gcs.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv9.4a-gcs.txt @@ -69,16 +69,16 @@ // CHECK: gcsb dsync [0x7a,0x0f,0x1f,0xd9] -// CHECK: gcsstr x26, x27 +// CHECK: gcsstr x26, [x27] [0xfa,0x0f,0x1f,0xd9] -// CHECK: gcsstr x26, sp +// CHECK: gcsstr x26, [sp] [0x7a,0x1f,0x1f,0xd9] -// CHECK: gcssttr x26, x27 +// CHECK: gcssttr x26, [x27] [0xfa,0x1f,0x1f,0xd9] -// CHECK: gcssttr x26, sp +// CHECK: gcssttr x26, [sp] [0x9f,0x77,0x08,0xd5] // CHECK: gcspushx From 35593f6613445fe4a8daa6e7589deec82fcd4d2b Mon Sep 17 00:00:00 2001 From: Sergei Lebedev <185856+superbobry@users.noreply.github.com> Date: Wed, 21 Feb 2024 10:06:29 +0000 Subject: [PATCH 074/351] [MLIR][Python] Use isinstance() instead of issubclass(type(...), ...) (#82345) The two forms are equivalent, so there is no reason to use the longer one. 
--- mlir/test/mlir-tblgen/op-python-bindings.td | 2 +- mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/test/mlir-tblgen/op-python-bindings.td b/mlir/test/mlir-tblgen/op-python-bindings.td index f7df8ba2df0ae..dbed1164f1eb0 100644 --- a/mlir/test/mlir-tblgen/op-python-bindings.td +++ b/mlir/test/mlir-tblgen/op-python-bindings.td @@ -123,7 +123,7 @@ def AttributedOp : TestOp<"attributed_op"> { // CHECK: attributes = {} // CHECK: regions = None // CHECK: attributes["i32attr"] = (i32attr if ( - // CHECK-NEXT: issubclass(type(i32attr), _ods_ir.Attribute) or + // CHECK-NEXT: isinstance(i32attr, _ods_ir.Attribute) or // CHECK-NEXT: not _ods_ir.AttrBuilder.contains('I32Attr') // CHECK-NEXT: _ods_ir.AttrBuilder.get('I32Attr')(i32attr, context=_ods_context) // CHECK: if optionalF32Attr is not None: attributes["optionalF32Attr"] = (optionalF32Attr diff --git a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp index 0770ed562309e..640360eff734a 100644 --- a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp +++ b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp @@ -534,7 +534,7 @@ constexpr const char *multiResultAppendTemplate = "results.extend({0})"; /// there is no method registered to make it an Attribute. constexpr const char *initAttributeWithBuilderTemplate = R"Py(attributes["{1}"] = ({0} if ( - issubclass(type({0}), _ods_ir.Attribute) or + isinstance({0}, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('{2}')) else _ods_ir.AttrBuilder.get('{2}')({0}, context=_ods_context)))Py"; @@ -547,7 +547,7 @@ constexpr const char *initAttributeWithBuilderTemplate = /// there is no method registered to make it an Attribute. 
constexpr const char *initOptionalAttributeWithBuilderTemplate = R"Py(if {0} is not None: attributes["{1}"] = ({0} if ( - issubclass(type({0}), _ods_ir.Attribute) or + isinstance({0}, _ods_ir.Attribute) or not _ods_ir.AttrBuilder.contains('{2}')) else _ods_ir.AttrBuilder.get('{2}')({0}, context=_ods_context)))Py"; From 3533fe783df4b417f16077edb70099010d2d7eef Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Wed, 21 Feb 2024 11:15:42 +0100 Subject: [PATCH 075/351] Revert "[clang] Preserve found-decl when constructing VarTemplateIds (#82265)" This reverts commit 50373506d570f3db1e1af7c13d46409736452f3a. Broke include-cleaner tests --- clang/include/clang/Sema/Sema.h | 2 +- clang/lib/Sema/SemaTemplate.cpp | 18 ++++++++++-------- clang/test/AST/ast-dump-using.cpp | 7 ------- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 23e1a623a20d1..89215bf3d1c69 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -8538,7 +8538,7 @@ class Sema final { /// if the arguments are dependent. 
ExprResult CheckVarTemplateId(const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, - VarTemplateDecl *Template, NamedDecl *FoundD, + VarTemplateDecl *Template, SourceLocation TemplateLoc, const TemplateArgumentListInfo *TemplateArgs); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 7d3d665194add..1a975a8d0a0df 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4958,10 +4958,11 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc, return Decl; } -ExprResult Sema::CheckVarTemplateId( - const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, - VarTemplateDecl *Template, NamedDecl *FoundD, SourceLocation TemplateLoc, - const TemplateArgumentListInfo *TemplateArgs) { +ExprResult +Sema::CheckVarTemplateId(const CXXScopeSpec &SS, + const DeclarationNameInfo &NameInfo, + VarTemplateDecl *Template, SourceLocation TemplateLoc, + const TemplateArgumentListInfo *TemplateArgs) { DeclResult Decl = CheckVarTemplateId(Template, TemplateLoc, NameInfo.getLoc(), *TemplateArgs); @@ -4977,7 +4978,8 @@ ExprResult Sema::CheckVarTemplateId( NameInfo.getLoc()); // Build an ordinary singleton decl ref. - return BuildDeclarationNameExpr(SS, NameInfo, Var, FoundD, TemplateArgs); + return BuildDeclarationNameExpr(SS, NameInfo, Var, + /*FoundD=*/nullptr, TemplateArgs); } void Sema::diagnoseMissingTemplateArguments(TemplateName Name, @@ -5064,9 +5066,9 @@ ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS, bool KnownDependent = false; // In C++1y, check variable template ids. if (R.getAsSingle()) { - ExprResult Res = CheckVarTemplateId( - SS, R.getLookupNameInfo(), R.getAsSingle(), - R.getRepresentativeDecl(), TemplateKWLoc, TemplateArgs); + ExprResult Res = CheckVarTemplateId(SS, R.getLookupNameInfo(), + R.getAsSingle(), + TemplateKWLoc, TemplateArgs); if (Res.isInvalid() || Res.isUsable()) return Res; // Result is dependent. Carry on to build an UnresolvedLookupEpxr. 
diff --git a/clang/test/AST/ast-dump-using.cpp b/clang/test/AST/ast-dump-using.cpp index 8e5c60d3aabf4..5a4e910ffb865 100644 --- a/clang/test/AST/ast-dump-using.cpp +++ b/clang/test/AST/ast-dump-using.cpp @@ -2,7 +2,6 @@ namespace a { struct S; -template T x = {}; } namespace b { using a::S; @@ -22,10 +21,4 @@ typedef S e; // check the same UsingType is reused. // CHECK-NEXT: `-UsingType [[TYPE_ADDR]] 'a::S' sugar // CHECK-NEXT: |-UsingShadow [[SHADOW_ADDR]] 'S' // CHECK-NEXT: `-RecordType {{.*}} 'a::S' -using a::x; - -void foo() { - x = 3; - // CHECK: DeclRefExpr {{.*}} 'x' {{.*}} (UsingShadow {{.*}} 'x') -} } From d31406b394307e5629372271f797f55c7ca9bbd3 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 21 Feb 2024 10:20:02 +0000 Subject: [PATCH 076/351] [flang][docs] Update llvm-test-suite docs (#81596) With some missing config options and a link to the test suite docs that explain how to setup `ISO_FORTRAN_C_HEADER` and set the stop message variable. --- flang/docs/FortranLLVMTestSuite.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/flang/docs/FortranLLVMTestSuite.md b/flang/docs/FortranLLVMTestSuite.md index 45485ef40106f..611e03cbad0eb 100644 --- a/flang/docs/FortranLLVMTestSuite.md +++ b/flang/docs/FortranLLVMTestSuite.md @@ -21,7 +21,9 @@ cmake -G "Ninja" -DCMAKE_C_COMPILER= \ -DCMAKE_Fortran_COMPILER= \ -DTEST_SUITE_COLLECT_CODE_SIZE:STRING=OFF \ -DTEST_SUITE_SUBDIRS:STRING="Fortran" \ - -DTEST_SUITE_FORTRAN:STRING=ON .. + -DTEST_SUITE_FORTRAN:STRING=ON \ + -DTEST_SUITE_LIT= \ + ``` This will configure the test-suite to run only the Fortran tests which @@ -29,10 +31,15 @@ are found in the Fortran subdirectory. To run the C/C++ tests alongside the Fortran tests omit the `-DTEST_SUITE_SUBDIRS` CMake variable. -If your Fortran compiler is Flang, you may want to set the `NO_STOP_MESSAGE` -environment variable to `1` in order to avoid test failures due to warnings -about INEXACT signaling exceptions. 
+If your Fortran compiler is Flang, there are a couple of other things you need +to do, which are explained +[here](https://github.com/llvm/llvm-test-suite/blob/main/Fortran/gfortran/README.md#usage). +Then to build and run the tests: +``` +ninja +ninja check +``` ## Running the SPEC CPU 2017 From bdeb3d47d185aedbe6af5eda5c91310e37938f5b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 10:07:07 +0000 Subject: [PATCH 077/351] [X86] Regenerate saddsat/ssubsat vector tests Adds missing avx512 constant broadcast comments --- llvm/test/CodeGen/X86/sadd_sat_vec.ll | 2 +- llvm/test/CodeGen/X86/ssub_sat_vec.ll | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index e4f9217b9c19e..b2b242fa29818 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1236,7 +1236,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; AVX512BW-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k2} = [9223372036854775807,9223372036854775807] ; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index cb15dbef33dec..64aead7041575 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -1333,7 +1333,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512BW-NEXT: kxorw %k1, %k0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; AVX512BW-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2} +; 
AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 {%k2} = [9223372036854775807,9223372036854775807] ; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: retq From 3cb4f62de0eba62edd730d0ed80fd90d2826763d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 10:35:30 +0000 Subject: [PATCH 078/351] [X86] Regenerate vector tests to add missing avx512 constant broadcast comments --- .../CodeGen/X86/avx512vl-intrinsics-fast-isel.ll | 16 ++++++++-------- .../CodeGen/X86/prefer-avx256-mask-extend.ll | 4 ++-- llvm/test/CodeGen/X86/vector-bo-select.ll | 6 +++--- llvm/test/CodeGen/X86/vselect-avx.ll | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index 06e7096e430bb..87799c1e82fed 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -1905,13 +1905,13 @@ define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1} +; X86-NEXT: vpbroadcastd {{.*#+}} xmm0 {%k1} = [5,5,5,5] ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_set1_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} +; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 {%k1} = [5,5,5,5] ; X64-NEXT: retq entry: %0 = bitcast <2 x i64> %__O to <4 x i32> @@ -1927,13 +1927,13 @@ define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1} {z} +; X86-NEXT: vpbroadcastd {{.*#+}} xmm0 {%k1} {z} = [5,5,5,5] ; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskz_set1_epi32: ; X64: # 
%bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z} +; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 {%k1} {z} = [5,5,5,5] ; X64-NEXT: retq entry: %0 = bitcast i8 %__M to <8 x i1> @@ -1948,13 +1948,13 @@ define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 {%k1} +; X86-NEXT: vpbroadcastd {{.*#+}} ymm0 {%k1} = [5,5,5,5,5,5,5,5] ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_set1_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} +; X64-NEXT: vpbroadcastd {{.*#+}} ymm0 {%k1} = [5,5,5,5,5,5,5,5] ; X64-NEXT: retq entry: %0 = bitcast <4 x i64> %__O to <8 x i32> @@ -1969,13 +1969,13 @@ define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 {%k1} {z} +; X86-NEXT: vpbroadcastd {{.*#+}} ymm0 {%k1} {z} = [5,5,5,5,5,5,5,5] ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_set1_epi32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} {z} +; X64-NEXT: vpbroadcastd {{.*#+}} ymm0 {%k1} {z} = [5,5,5,5,5,5,5,5] ; X64-NEXT: retq entry: %0 = bitcast i8 %__M to <8 x i1> diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll index 98df54f110644..cca9d270fd498 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll @@ -194,7 +194,7 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) { ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 -; 
AVX512VL-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -206,7 +206,7 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 0c5d5dd86bb71..78797b9acc2e6 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -1270,7 +1270,7 @@ define <16 x float> @fdiv_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vbroadcastss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2 {%k1} +; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 {%k1} = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX512-NEXT: vdivps %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x float> , <16 x float> %y @@ -5783,7 +5783,7 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2 {%k1} +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 {%k1} = [1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm0 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx ; AVX512F-NEXT: 
vextracti32x4 $3, %zmm1, %xmm3 @@ -5844,7 +5844,7 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512VL-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2 {%k1} +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 {%k1} = [1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vextracti32x4 $3, %zmm2, %xmm0 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx ; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 7b6a69475a9c4..8dda27145bd37 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -71,7 +71,7 @@ define void @test2(ptr %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: movq (%rdi,%rsi,8), %rax ; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] -; AVX512-NEXT: vbroadcastsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} +; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 {%k1} = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX512-NEXT: vmovupd %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq From a0b3dbaf4b3c01dc7f0a83fce059a26360b58eb2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 10:46:02 +0000 Subject: [PATCH 079/351] [InstCombine] Regenerate some fcmp tests to use the update_test_checks.py script --- .../InstCombine/2008-05-23-CompareFold.ll | 15 ++++--- .../Transforms/InstCombine/2008-11-08-FCmp.ll | 39 ++++++++++++------- .../InstCombine/2009-05-23-FCmpToICmp.ll | 14 ++++--- .../Transforms/InstCombine/2012-02-13-FCmp.ll | 23 ++++++++++- 4 files changed, 65 insertions(+), 26 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/2008-05-23-CompareFold.ll b/llvm/test/Transforms/InstCombine/2008-05-23-CompareFold.ll index 66751eb704548..76b443bc22a73 100644 --- 
a/llvm/test/Transforms/InstCombine/2008-05-23-CompareFold.ll +++ b/llvm/test/Transforms/InstCombine/2008-05-23-CompareFold.ll @@ -1,14 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=instcombine -S < %s | FileCheck %s ; PR2359 -; CHECK-LABEL: @f( -; CHECK: ret i1 false define i1 @f(ptr %x) { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i1 false +; entry: - %tmp462 = load i8, ptr %x, align 1 ; [#uses=1] - %tmp462463 = sitofp i8 %tmp462 to float ; [#uses=1] - %tmp464 = fcmp ugt float %tmp462463, 0x47EFFFFFE0000000 ; - ret i1 %tmp464 + %tmp462 = load i8, ptr %x, align 1 ; [#uses=1] + %tmp462463 = sitofp i8 %tmp462 to float ; [#uses=1] + %tmp464 = fcmp ugt float %tmp462463, 0x47EFFFFFE0000000 ; + ret i1 %tmp464 } diff --git a/llvm/test/Transforms/InstCombine/2008-11-08-FCmp.ll b/llvm/test/Transforms/InstCombine/2008-11-08-FCmp.ll index fe3071bf0c01a..6df5ac40ca7c3 100644 --- a/llvm/test/Transforms/InstCombine/2008-11-08-FCmp.ll +++ b/llvm/test/Transforms/InstCombine/2008-11-08-FCmp.ll @@ -1,63 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instcombine -S | FileCheck %s ; PR3021 ; When inst combining an FCMP with the LHS coming from a uitofp instruction, we ; can't lower it to signed ICMP instructions. 
-; CHECK-LABEL: @test1( define i1 @test1(i32 %val) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[VAL:%.*]], 0 +; CHECK-NEXT: ret i1 [[TMP1]] +; %1 = uitofp i32 %val to double %2 = fcmp ole double %1, 0.000000e+00 -; CHECK: icmp eq i32 %val, 0 ret i1 %2 } -; CHECK-LABEL: @test2( define i1 @test2(i32 %val) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: ret i1 false +; %1 = uitofp i32 %val to double %2 = fcmp olt double %1, 0.000000e+00 ret i1 %2 -; CHECK: ret i1 false } -; CHECK-LABEL: @test3( define i1 @test3(i32 %val) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: ret i1 true +; %1 = uitofp i32 %val to double %2 = fcmp oge double %1, 0.000000e+00 ret i1 %2 -; CHECK: ret i1 true } -; CHECK-LABEL: @test4( define i1 @test4(i32 %val) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[VAL:%.*]], 0 +; CHECK-NEXT: ret i1 [[TMP1]] +; %1 = uitofp i32 %val to double %2 = fcmp ogt double %1, 0.000000e+00 -; CHECK: icmp ne i32 %val, 0 ret i1 %2 } -; CHECK-LABEL: @test5( define i1 @test5(i32 %val) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: ret i1 true +; %1 = uitofp i32 %val to double %2 = fcmp ogt double %1, -4.400000e+00 ret i1 %2 -; CHECK: ret i1 true } -; CHECK-LABEL: @test6( define i1 @test6(i32 %val) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: ret i1 false +; %1 = uitofp i32 %val to double %2 = fcmp olt double %1, -4.400000e+00 ret i1 %2 -; CHECK: ret i1 false } ; Check that optimizing unsigned >= comparisons correctly distinguishes ; positive and negative constants. 
-; CHECK-LABEL: @test7( define i1 @test7(i32 %val) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[VAL:%.*]], 3 +; CHECK-NEXT: ret i1 [[TMP1]] +; %1 = uitofp i32 %val to double %2 = fcmp oge double %1, 3.200000e+00 ret i1 %2 -; CHECK: icmp ugt i32 %val, 3 } diff --git a/llvm/test/Transforms/InstCombine/2009-05-23-FCmpToICmp.ll b/llvm/test/Transforms/InstCombine/2009-05-23-FCmpToICmp.ll index 7f32f947aa216..0ceaf5a61361e 100644 --- a/llvm/test/Transforms/InstCombine/2009-05-23-FCmpToICmp.ll +++ b/llvm/test/Transforms/InstCombine/2009-05-23-FCmpToICmp.ll @@ -1,9 +1,13 @@ -; RUN: opt < %s -passes=instcombine -S | not grep cmp +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s ; rdar://6903175 define i1 @f0(ptr %a) nounwind { - %b = load i32, ptr %a, align 4 - %c = uitofp i32 %b to double - %d = fcmp ogt double %c, 0x41EFFFFFFFE00000 - ret i1 %d +; CHECK-LABEL: @f0( +; CHECK-NEXT: ret i1 false +; + %b = load i32, ptr %a, align 4 + %c = uitofp i32 %b to double + %d = fcmp ogt double %c, 0x41EFFFFFFFE00000 + ret i1 %d } diff --git a/llvm/test/Transforms/InstCombine/2012-02-13-FCmp.ll b/llvm/test/Transforms/InstCombine/2012-02-13-FCmp.ll index 7366b50ed4b51..82c5817f06e2b 100644 --- a/llvm/test/Transforms/InstCombine/2012-02-13-FCmp.ll +++ b/llvm/test/Transforms/InstCombine/2012-02-13-FCmp.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=instcombine -S < %s | FileCheck %s ; Radar 10803727 @.str = private unnamed_addr constant [35 x i8] c"\0Ain_range input (should be 0): %f\0A\00", align 1 @@ -5,13 +6,33 @@ declare i32 @printf(ptr, ...) define i64 @_Z8tempCastj(i32 %val) uwtable ssp { +; CHECK-LABEL: @_Z8tempCastj( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr, ...) 
@printf(ptr noundef nonnull dereferenceable(1) @.str1, i32 [[VAL:%.*]]) +; CHECK-NEXT: [[CONV:%.*]] = uitofp i32 [[VAL]] to double +; CHECK-NEXT: [[CALL_I:%.*]] = call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, double [[CONV]]) +; CHECK-NEXT: br i1 true, label [[LAND_RHS_I:%.*]], label [[IF_END_CRITEDGE:%.*]] +; CHECK: land.rhs.i: +; CHECK-NEXT: [[CMP1_I:%.*]] = icmp eq i32 [[VAL]], 0 +; CHECK-NEXT: br i1 [[CMP1_I]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 5.000000e-01 +; CHECK-NEXT: [[CONV3:%.*]] = fptosi double [[ADD]] to i64 +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: if.end.critedge: +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i64 [ [[CONV3]], [[IF_THEN]] ], [ -1, [[IF_END]] ] +; CHECK-NEXT: ret i64 [[RETVAL_0]] +; entry: %call = call i32 (ptr, ...) @printf(ptr @.str1, i32 %val) %conv = uitofp i32 %val to double %call.i = call i32 (ptr, ...) @printf(ptr @.str, double %conv) %cmp.i = fcmp oge double %conv, -1.000000e+00 br i1 %cmp.i, label %land.rhs.i, label %if.end.critedge -; CHECK: br i1 true, label %land.rhs.i, label %if.end.critedge land.rhs.i: ; preds = %entry %cmp1.i = fcmp olt double %conv, 1.000000e+00 From 6d160a49c2e7f36367de3f61f0460e28921450d5 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Wed, 21 Feb 2024 13:45:36 +0200 Subject: [PATCH 080/351] [AMDGPU][TableGen][NFC] Combine predicates without using classes. (#82346) Saves generating ~1200 instances of the PredConcat TableGen class. Also removes the default predicates from resulting predicate lists. 
--- llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td b/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td index 6c7c91ef464df..7c990aa6b2eb6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td @@ -11,11 +11,6 @@ def TruePredicate : Predicate<"">; // FIXME: Tablegen should specially supports this def FalsePredicate : Predicate<"false">; -// Add a predicate to the list if does not already exist to deduplicate it. -class PredConcat lst> { - list ret = !listconcat(lst, !listremove([pred], lst)); -} - // Prevent using other kinds of predicates where True16 predicates are // expected by giving them their own class. class True16PredicateClass : Predicate; @@ -28,9 +23,8 @@ class PredicateControl { True16PredicateClass True16Predicate = NoTrue16Predicate; list OtherPredicates = []; list Predicates = - PredConcat.ret>.ret>.ret>.ret; + !foldl(OtherPredicates, [SubtargetPredicate, AssemblerPredicate, + WaveSizePredicate, True16Predicate], + preds, p, + preds # !listremove([p], [TruePredicate, NoTrue16Predicate] # preds)); } From 6ce5159945997126b8a0f40f55e876c9fd882fc5 Mon Sep 17 00:00:00 2001 From: Sergei Lebedev <185856+superbobry@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:59:23 +0000 Subject: [PATCH 081/351] [MLIR][Python] Use ir.Value directly instead of _SubClassValueT (#82341) _SubClassValueT is only useful when it is has >1 usage in a signature. This was not true for the signatures produced by tblgen. For example def call(result, callee, operands_, *, loc=None, ip=None) -> _SubClassValueT: ... here a type checker does not have enough information to infer a type argument for _SubClassValueT, and thus effectively treats it as Any. 
--- mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi | 2 +- mlir/python/mlir/dialects/_ods_common.py | 7 ------- mlir/python/mlir/dialects/arith.py | 3 +-- mlir/test/mlir-tblgen/op-python-bindings.td | 1 - mlir/test/python/ir/value.py | 3 +-- mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp | 18 +++++++----------- 6 files changed, 10 insertions(+), 24 deletions(-) diff --git a/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi b/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi index 3ed1872f1cd5a..93b978c75540f 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi @@ -10,4 +10,4 @@ class _Globals: def _check_dialect_module_loaded(self, dialect_namespace: str) -> bool: ... def register_dialect(dialect_class: type) -> object: ... -def register_operation(dialect_class: type) -> object: ... +def register_operation(dialect_class: type, *, replace: bool = ...) -> object: ... diff --git a/mlir/python/mlir/dialects/_ods_common.py b/mlir/python/mlir/dialects/_ods_common.py index 3af3b5ce73bc6..1e7e8244ed442 100644 --- a/mlir/python/mlir/dialects/_ods_common.py +++ b/mlir/python/mlir/dialects/_ods_common.py @@ -8,7 +8,6 @@ Sequence as _Sequence, Tuple as _Tuple, Type as _Type, - TypeVar as _TypeVar, Union as _Union, ) @@ -143,12 +142,6 @@ def get_op_result_or_op_results( else op ) - -# This is the standard way to indicate subclass/inheritance relationship -# see the typing.Type doc string. 
-_U = _TypeVar("_U", bound=_cext.ir.Value) -SubClassValueT = _Type[_U] - ResultValueTypeTuple = _cext.ir.Operation, _cext.ir.OpView, _cext.ir.Value ResultValueT = _Union[ResultValueTypeTuple] VariadicResultValueT = _Union[ResultValueT, _Sequence[ResultValueT]] diff --git a/mlir/python/mlir/dialects/arith.py b/mlir/python/mlir/dialects/arith.py index 663a53660a647..61c6917393f1f 100644 --- a/mlir/python/mlir/dialects/arith.py +++ b/mlir/python/mlir/dialects/arith.py @@ -12,7 +12,6 @@ get_default_loc_context as _get_default_loc_context, _cext as _ods_cext, get_op_result_or_op_results as _get_op_result_or_op_results, - SubClassValueT as _SubClassValueT, ) from typing import Any, List, Union @@ -81,5 +80,5 @@ def literal_value(self) -> Union[int, float]: def constant( result: Type, value: Union[int, float, Attribute], *, loc=None, ip=None -) -> _SubClassValueT: +) -> Value: return _get_op_result_or_op_results(ConstantOp(result, value, loc=loc, ip=ip)) diff --git a/mlir/test/mlir-tblgen/op-python-bindings.td b/mlir/test/mlir-tblgen/op-python-bindings.td index dbed1164f1eb0..9f202ba08608c 100644 --- a/mlir/test/mlir-tblgen/op-python-bindings.td +++ b/mlir/test/mlir-tblgen/op-python-bindings.td @@ -7,7 +7,6 @@ include "mlir/Interfaces/InferTypeOpInterface.td" // CHECK: @_ods_cext.register_dialect // CHECK: class _Dialect(_ods_ir.Dialect): // CHECK: DIALECT_NAMESPACE = "test" - // CHECK: pass def Test_Dialect : Dialect { let name = "test"; let cppNamespace = "Test"; diff --git a/mlir/test/python/ir/value.py b/mlir/test/python/ir/value.py index 28ef0f2ef3e25..50b0e8403a7f2 100644 --- a/mlir/test/python/ir/value.py +++ b/mlir/test/python/ir/value.py @@ -3,7 +3,6 @@ import gc from mlir.ir import * from mlir.dialects import func -from mlir.dialects._ods_common import SubClassValueT def run(f): @@ -270,7 +269,7 @@ def __str__(self): return super().__str__().replace(Value.__name__, NOPBlockArg.__name__) @register_value_caster(IntegerType.static_typeid) - def cast_int(v) -> 
SubClassValueT: + def cast_int(v) -> Value: print("in caster", v.__class__.__name__) if isinstance(v, OpResult): return NOPResult(v) diff --git a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp index 640360eff734a..814008c254511 100644 --- a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp +++ b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp @@ -31,7 +31,6 @@ constexpr const char *fileHeader = R"Py( from ._ods_common import _cext as _ods_cext from ._ods_common import ( - SubClassValueT as _SubClassValueT, equally_sized_accessor as _ods_equally_sized_accessor, get_default_loc_context as _ods_get_default_loc_context, get_op_result_or_op_results as _get_op_result_or_op_results, @@ -52,8 +51,6 @@ constexpr const char *dialectClassTemplate = R"Py( @_ods_cext.register_dialect class _Dialect(_ods_ir.Dialect): DIALECT_NAMESPACE = "{0}" - pass - )Py"; constexpr const char *dialectExtensionTemplate = R"Py( @@ -1007,14 +1004,13 @@ static void emitValueBuilder(const Operator &op, }); std::string nameWithoutDialect = op.getOperationName().substr(op.getOperationName().find('.') + 1); - os << llvm::formatv(valueBuilderTemplate, sanitizeName(nameWithoutDialect), - op.getCppClassName(), - llvm::join(valueBuilderParams, ", "), - llvm::join(opBuilderArgs, ", "), - (op.getNumResults() > 1 - ? "_Sequence[_SubClassValueT]" - : (op.getNumResults() > 0 ? "_SubClassValueT" - : "_ods_ir.Operation"))); + os << llvm::formatv( + valueBuilderTemplate, sanitizeName(nameWithoutDialect), + op.getCppClassName(), llvm::join(valueBuilderParams, ", "), + llvm::join(opBuilderArgs, ", "), + (op.getNumResults() > 1 + ? "_Sequence[_ods_ir.Value]" + : (op.getNumResults() > 0 ? "_ods_ir.Value" : "_ods_ir.Operation"))); } /// Emits bindings for a specific Op to the given output stream. 
From b1080e187e91576ac6d44087f072583e101f0f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 20 Feb 2024 16:35:16 +0100 Subject: [PATCH 082/351] [clang][Interp] Convert complex initializers to rvalues We internalle handle these via pointers, but we need to return them as RValues in initializers. --- clang/lib/AST/Interp/EvalEmitter.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index d90cf1812bb77..9cae25f5c4d64 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -47,6 +47,9 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E, EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD, bool CheckFullyInitialized) { this->CheckFullyInitialized = CheckFullyInitialized; + this->ConvertResultToRValue = + VD->getAnyInitializer() && + (VD->getAnyInitializer()->getType()->isAnyComplexType()); EvalResult.setSource(VD); if (!this->visitDecl(VD) && EvalResult.empty()) From 40fae67a50e08e6b5b5300210021218e404d63a7 Mon Sep 17 00:00:00 2001 From: harishch4 Date: Wed, 21 Feb 2024 17:44:54 +0530 Subject: [PATCH 083/351] [Flang][OpenMP] Fix to construct-names inside OpenMP construct with default(none) (#82479) When a do loop with a construct-name is used inside OpenMP construct with default(none), an incorrect error will be raised as below. ``` program cn_and_default implicit none integer :: i !$omp parallel default(none) loop: do i = 1, 10 end do loop !$omp end parallel end program ``` > The DEFAULT(NONE) clause requires that 'loop' must be listed in a data-sharing attribute clause This patch fixes this by adding a condition to check and skip processing construct-names. 
--- flang/lib/Semantics/resolve-directives.cpp | 6 ++++++ flang/test/Semantics/OpenMP/default-none.f90 | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 4b6d083671bc9..a826f0181e580 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -1982,6 +1982,12 @@ void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) { void OmpAttributeVisitor::Post(const parser::Name &name) { auto *symbol{name.symbol}; if (symbol && !dirContext_.empty() && GetContext().withinConstruct) { + // Exclude construct-names + if (auto *details{symbol->detailsIf()}) { + if (details->kind() == semantics::MiscDetails::Kind::ConstructName) { + return; + } + } if (!symbol->owner().IsDerivedType() && !IsProcedure(*symbol) && !IsObjectWithDSA(*symbol) && !IsNamedConstant(*symbol)) { // TODO: create a separate function to go through the rules for diff --git a/flang/test/Semantics/OpenMP/default-none.f90 b/flang/test/Semantics/OpenMP/default-none.f90 index d027f46f00584..11ba878ea7794 100644 --- a/flang/test/Semantics/OpenMP/default-none.f90 +++ b/flang/test/Semantics/OpenMP/default-none.f90 @@ -39,3 +39,11 @@ subroutine sb3(x) print *, x end subroutine end subroutine + +!construct-name inside default(none) +subroutine sb4 + !$omp parallel default(none) + loop: do i = 1, 10 + end do loop + !$omp end parallel +end subroutine From e209178d6402348414b69941c77d621919b3b7ab Mon Sep 17 00:00:00 2001 From: Vedant Paranjape Date: Wed, 21 Feb 2024 17:51:56 +0530 Subject: [PATCH 084/351] [SimplifyIndVar] LCSSA form is destroyed by simplifyLoopIVs, preserve it (#78696) In LoopUnroll, peelLoop is called on the loop. After the loop is peeled it calls simplifyLoopAfterUnroll on the loop. 
This call to simplifyLoopAfterUnroll doesn't preserve the LCSSA form of the parent loop and thus during the next call to peelLoop the LCSSA form is already broken. LoopPeel util takes in the PreserveLCSSA argument and it passes on the same argument to simplifyLoop which checks if the loop is in a valid LCSSA form, when (PreserveLCSSA = true). This causes an assert in simplifyLoop when (PreserveLCSSA = true), as during the last call LCSSA for the loop wasn't preserved, and thus crashes at the following assert. assert(L->isRecursivelyLCSSAForm(*DT, *LI) && "Requested to preserve LCSSA, but it's already broken."); Upon debugging, it is evident that simplifyLoopIVs call inside simplifyLoopAfterUnroll breaks the LCSSA form. This patch fixes llvm#77118, it checks if the replacement of IV Users with Loop Invariant preserves the LCSSA form. If it does not, it emits the required LCSSA Phi instructions. --- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 12 ++++ .../gh-issue77118-broken-lcssa-form.ll | 56 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 llvm/test/Transforms/LoopUnroll/gh-issue77118-broken-lcssa-form.ll diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 66bba1ca2f1d7..297cfe5124d85 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" using namespace llvm; @@ -643,10 +644,21 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) { } auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP); + bool NeedToEmitLCSSAPhis = false; + if (!LI->replacementPreservesLCSSAForm(I, Invariant)) + NeedToEmitLCSSAPhis = true; I->replaceAllUsesWith(Invariant); LLVM_DEBUG(dbgs() << "INDVARS: 
Replace IV user: " << *I << " with loop invariant: " << *S << '\n'); + + if (NeedToEmitLCSSAPhis) { + SmallVector NeedsLCSSAPhis; + NeedsLCSSAPhis.push_back(cast(Invariant)); + formLCSSAForInstructions(NeedsLCSSAPhis, *DT, *LI, SE); + LLVM_DEBUG(dbgs() << " INDVARS: Replacement breaks LCSSA form" + << " inserting LCSSA Phis" << '\n'); + } ++NumFoldedUser; Changed = true; DeadInsts.emplace_back(I); diff --git a/llvm/test/Transforms/LoopUnroll/gh-issue77118-broken-lcssa-form.ll b/llvm/test/Transforms/LoopUnroll/gh-issue77118-broken-lcssa-form.ll new file mode 100644 index 0000000000000..2f07b81d888b8 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/gh-issue77118-broken-lcssa-form.ll @@ -0,0 +1,56 @@ +; RUN: opt -passes=loop-unroll -unroll-peel-count=2 -S -disable-output -debug-only=loop-unroll < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +define void @test() { +; CHECK-LABEL: Loop Unroll: F[test] Loop %loop3 +; CHECK-NEXT: Loop Size = 7 +; CHECK-NEXT: PEELING loop %loop3 with iteration count 2! +; CHECK-NEXT: Loop Unroll: F[test] Loop %loop2 +; CHECK-NEXT: Loop Size = 28 +; CHECK-NEXT: PEELING loop %loop2 with iteration count 2! +; CHECK-NEXT: Loop Unroll: F[test] Loop %loop4 +; CHECK-NEXT: Loop Size = 3 +; CHECK-NEXT: PEELING loop %loop4 with iteration count 2! +; CHECK-NEXT: Loop Unroll: F[test] Loop %loop1 +; CHECK-NEXT: Loop Size = 95 +; CHECK-NEXT: PEELING loop %loop1 with iteration count 2! 
+entry: + br label %loop1 + +loop1: + %phi = phi i32 [ 1, %entry ], [ 0, %loop1.latch ] + br label %loop2 + +loop2: + %phi3 = phi i64 [ 0, %loop1 ], [ %sext, %loop2.latch ] + br label %loop3 + +loop3: + %phi5 = phi i64 [ %phi3, %loop2 ], [ %sext, %loop3.latch ] + %phi6 = phi i32 [ 1, %loop2 ], [ %add10, %loop3.latch ] + %trunc = trunc i64 %phi5 to i32 + br i1 true, label %loop3.latch, label %exit + +loop3.latch: + %add = add i32 1, %phi + %sext = sext i32 %add to i64 + %add10 = add i32 %phi6, 1 + %icmp = icmp ugt i32 %add10, 2 + br i1 %icmp, label %loop2.latch, label %loop3 + +loop2.latch: + br i1 false, label %loop4.preheader, label %loop2 + +loop4.preheader: + br label %loop4 + +loop4: + br i1 false, label %loop1.latch, label %loop4 + +loop1.latch: + br label %loop1 + +exit: + %phi8 = phi i32 [ %trunc, %loop3 ] + ret void +} From 02e17ab1b97a8c0dc22facc8c66850e5aca28b60 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 21 Feb 2024 12:29:28 +0000 Subject: [PATCH 085/351] [AArch64] Added feature dependencies for SME2p1 to TargetParser (#81860) This patches adds missing target-feature dependencies for SME2.1 --- clang/test/Sema/aarch64-sme2p1-diagnostics.c | 10 ++++++++++ llvm/include/llvm/TargetParser/AArch64TargetParser.h | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 clang/test/Sema/aarch64-sme2p1-diagnostics.c diff --git a/clang/test/Sema/aarch64-sme2p1-diagnostics.c b/clang/test/Sema/aarch64-sme2p1-diagnostics.c new file mode 100644 index 0000000000000..a0adb04038581 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2p1-diagnostics.c @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target +#include "arm_sme.h" + +svuint8x2_t test_sme2p1(svuint8x2_t x) { + // expected-no-diagnostics + return x; +} + diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 
ed9944bcef23d..7376ac98a2b09 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -273,7 +273,7 @@ inline constexpr ExtensionInfo Extensions[] = { {"sme-i16i64", AArch64::AEK_SMEI16I64, "+sme-i16i64", "-sme-i16i64", FEAT_SME_I64, "+sme,+sme-i16i64,+bf16", 570}, {"sme", AArch64::AEK_SME, "+sme", "-sme", FEAT_SME, "+sme,+bf16", 430}, {"sme2", AArch64::AEK_SME2, "+sme2", "-sme2", FEAT_SME2, "+sme2,+sme,+bf16", 580}, - {"sme2p1", AArch64::AEK_SME2p1, "+sme2p1", "-sme2p1", FEAT_INIT, "", 0}, + {"sme2p1", AArch64::AEK_SME2p1, "+sme2p1", "-sme2p1", FEAT_INIT, "+sme2p1,+sme2,+sme,+bf16", 0}, {"ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs", FEAT_SSBS, "", 490}, {"ssbs2", AArch64::AEK_NONE, {}, {}, FEAT_SSBS2, "+ssbs", 500}, {"sve-bf16", AArch64::AEK_NONE, {}, {}, FEAT_SVE_BF16, "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320}, From 28fb2b33c2f43f6a8057e398eb899eb61e6652e9 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 21 Feb 2024 13:03:24 +0000 Subject: [PATCH 086/351] [LLVM][SelectionDAG] Reduce number of ComputeValueVTs variants. (#75614) This is another step in the direction of fixing the `Fixed(0) != Scalable(0)` bugbear, although whilst weird I don't believe it's causing us any real issues. 
--- llvm/include/llvm/CodeGen/Analysis.h | 35 ++++++------- llvm/include/llvm/Support/TypeSize.h | 1 + llvm/lib/CodeGen/Analysis.cpp | 49 +++---------------- .../SelectionDAG/SelectionDAGBuilder.cpp | 4 +- .../InstCombineLoadStoreAlloca.cpp | 4 +- llvm/unittests/Support/TypeSizeTest.cpp | 3 ++ 6 files changed, 29 insertions(+), 67 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Analysis.h b/llvm/include/llvm/CodeGen/Analysis.h index 1c67fe2d003d9..6f7ed22b8ac71 100644 --- a/llvm/include/llvm/CodeGen/Analysis.h +++ b/llvm/include/llvm/CodeGen/Analysis.h @@ -62,36 +62,31 @@ inline unsigned ComputeLinearIndex(Type *Ty, /// If Offsets is non-null, it points to a vector to be filled in /// with the in-memory offsets of each of the individual values. /// -void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, - SmallVectorImpl &ValueVTs, - SmallVectorImpl *Offsets, - TypeSize StartingOffset); -void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, - SmallVectorImpl &ValueVTs, - SmallVectorImpl *Offsets = nullptr, - uint64_t StartingOffset = 0); -void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, - SmallVectorImpl &ValueVTs, - SmallVectorImpl *FixedOffsets, - uint64_t StartingOffset); - -/// Variant of ComputeValueVTs that also produces the memory VTs. 
-void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, - SmallVectorImpl &ValueVTs, - SmallVectorImpl *MemVTs, - SmallVectorImpl *Offsets, - TypeSize StartingOffset); void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl &ValueVTs, SmallVectorImpl *MemVTs, SmallVectorImpl *Offsets = nullptr, - uint64_t StartingOffset = 0); + TypeSize StartingOffset = TypeSize::getZero()); void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl &ValueVTs, SmallVectorImpl *MemVTs, SmallVectorImpl *FixedOffsets, uint64_t StartingOffset); +/// Variant of ComputeValueVTs that don't produce memory VTs. +inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, + Type *Ty, SmallVectorImpl &ValueVTs, + SmallVectorImpl *Offsets = nullptr, + TypeSize StartingOffset = TypeSize::getZero()) { + ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, Offsets, StartingOffset); +} +inline void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, + Type *Ty, SmallVectorImpl &ValueVTs, + SmallVectorImpl *FixedOffsets, + uint64_t StartingOffset) { + ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, FixedOffsets, StartingOffset); +} + /// computeValueLLTs - Given an LLVM IR type, compute a sequence of /// LLTs that represent all the individual underlying /// non-aggregate types that comprise it. diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index b00ebf9e8c454..1b793b0eccf3c 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -335,6 +335,7 @@ class TypeSize : public details::FixedOrScalableQuantity { static constexpr TypeSize getScalable(ScalarTy MinimumSize) { return TypeSize(MinimumSize, true); } + static constexpr TypeSize getZero() { return TypeSize(0, false); } // All code for this class below this point is needed because of the // temporary implicit conversion to uint64_t. 
The operator overloads are diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 1994e6aec84b2..af7643d93591f 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -81,6 +81,9 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, SmallVectorImpl *MemVTs, SmallVectorImpl *Offsets, TypeSize StartingOffset) { + assert((Ty->isScalableTy() == StartingOffset.isScalable() || + StartingOffset.isZero()) && + "Offset/TypeSize mismatch!"); // Given a struct type, recursively traverse the elements. if (StructType *STy = dyn_cast(Ty)) { // If the Offsets aren't needed, don't query the struct layout. This allows @@ -92,8 +95,8 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, EE = STy->element_end(); EI != EE; ++EI) { // Don't compute the element offset if we didn't get a StructLayout above. - TypeSize EltOffset = SL ? SL->getElementOffset(EI - EB) - : TypeSize::get(0, StartingOffset.isScalable()); + TypeSize EltOffset = + SL ? 
SL->getElementOffset(EI - EB) : TypeSize::getZero(); ComputeValueVTs(TLI, DL, *EI, ValueVTs, MemVTs, Offsets, StartingOffset + EltOffset); } @@ -119,52 +122,12 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Offsets->push_back(StartingOffset); } -void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl &ValueVTs, - SmallVectorImpl *Offsets, - TypeSize StartingOffset) { - return ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, Offsets, - StartingOffset); -} - -void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl &ValueVTs, - SmallVectorImpl *Offsets, - uint64_t StartingOffset) { - TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy()); - return ComputeValueVTs(TLI, DL, Ty, ValueVTs, Offsets, Offset); -} - -void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl &ValueVTs, - SmallVectorImpl *FixedOffsets, - uint64_t StartingOffset) { - TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy()); - if (FixedOffsets) { - SmallVector Offsets; - ComputeValueVTs(TLI, DL, Ty, ValueVTs, &Offsets, Offset); - for (TypeSize Offset : Offsets) - FixedOffsets->push_back(Offset.getFixedValue()); - } else { - ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, Offset); - } -} - -void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, - Type *Ty, SmallVectorImpl &ValueVTs, - SmallVectorImpl *MemVTs, - SmallVectorImpl *Offsets, - uint64_t StartingOffset) { - TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy()); - return ComputeValueVTs(TLI, DL, Ty, ValueVTs, MemVTs, Offsets, Offset); -} - void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl &ValueVTs, SmallVectorImpl *MemVTs, SmallVectorImpl *FixedOffsets, uint64_t StartingOffset) { - TypeSize Offset = TypeSize::get(StartingOffset, 
Ty->isScalableTy()); + TypeSize Offset = TypeSize::getFixed(StartingOffset); if (FixedOffsets) { SmallVector Offsets; ComputeValueVTs(TLI, DL, Ty, ValueVTs, MemVTs, &Offsets, Offset); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2bdf48643edc3..e893a5b616d33 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4332,7 +4332,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Type *Ty = I.getType(); SmallVector ValueVTs, MemVTs; SmallVector Offsets; - ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets, 0); + ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; @@ -4500,7 +4500,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { SmallVector ValueVTs, MemVTs; SmallVector Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), - SrcV->getType(), ValueVTs, &MemVTs, &Offsets, 0); + SrcV->getType(), ValueVTs, &MemVTs, &Offsets); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 1254a050027a4..a222889842f54 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -777,7 +777,7 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { auto *Zero = ConstantInt::get(IdxType, 0); Value *V = PoisonValue::get(T); - TypeSize Offset = TypeSize::get(0, ET->isScalableTy()); + TypeSize Offset = TypeSize::getZero(); for (uint64_t i = 0; i < NumElements; i++) { Value *Indices[2] = { Zero, @@ -1303,7 +1303,7 @@ static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) { auto *IdxType = 
Type::getInt64Ty(T->getContext()); auto *Zero = ConstantInt::get(IdxType, 0); - TypeSize Offset = TypeSize::get(0, AT->getElementType()->isScalableTy()); + TypeSize Offset = TypeSize::getZero(); for (uint64_t i = 0; i < NumElements; i++) { Value *Indices[2] = { Zero, diff --git a/llvm/unittests/Support/TypeSizeTest.cpp b/llvm/unittests/Support/TypeSizeTest.cpp index 503dc5d99b182..34fe376989e7b 100644 --- a/llvm/unittests/Support/TypeSizeTest.cpp +++ b/llvm/unittests/Support/TypeSizeTest.cpp @@ -82,9 +82,12 @@ static_assert(UINT64_C(2) * TSFixed32 == TypeSize::getFixed(64)); static_assert(alignTo(TypeSize::getFixed(7), 8) == TypeSize::getFixed(8)); static_assert(TypeSize() == TypeSize::getFixed(0)); +static_assert(TypeSize::getZero() == TypeSize::getFixed(0)); +static_assert(TypeSize::getZero() != TypeSize::getScalable(0)); static_assert(TypeSize::getFixed(0) != TypeSize::getScalable(0)); static_assert(TypeSize::getFixed(0).isZero()); static_assert(TypeSize::getScalable(0).isZero()); +static_assert(TypeSize::getZero().isZero()); static_assert(TypeSize::getFixed(0) == (TypeSize::getFixed(4) - TypeSize::getFixed(4))); static_assert(TypeSize::getScalable(0) == From c50ca3daa445f7e54343fb365339181185ee0f2c Mon Sep 17 00:00:00 2001 From: Chia Date: Wed, 21 Feb 2024 22:06:40 +0900 Subject: [PATCH 087/351] [RISCV][ISel] Combine vector fadd/fsub/fmul with fp extend. (#81248) Extend D133739 and #76785 to support vector widening floating-point add/sub/mul instructions. 
Specifically, this patch works for the below optimization case: ### Source code ``` define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) { %c = fpext <2 x float> %a to <2 x double> %d = fpext <2 x float> %b to <2 x double> %d2 = fpext <2 x float> %b2 to <2 x double> %e = fmul <2 x double> %c, %d %f = fadd <2 x double> %c, %d2 %g = fsub <2 x double> %d, %d2 store <2 x double> %e, ptr %x store <2 x double> %f, ptr %y store <2 x double> %g, ptr %z ret void } ``` ### Before this patch [Compiler Explorer](https://godbolt.org/z/aaEMs5s9h) ``` vfwmul_v2f32_multiple_users: vsetivli zero, 2, e32, mf2, ta, ma vfwcvt.f.f.v v11, v8 vfwcvt.f.f.v v8, v9 vfwcvt.f.f.v v9, v10 vsetvli zero, zero, e64, m1, ta, ma vfmul.vv v10, v11, v8 vfadd.vv v11, v11, v9 vfsub.vv v8, v8, v9 vse64.v v10, (a0) vse64.v v11, (a1) vse64.v v8, (a2) ret ``` ### After this patch ``` vfwmul_v2f32_multiple_users: vsetivli zero, 2, e32, mf2, ta, ma vfwmul.vv v11, v8, v9 vfwadd.vv v12, v8, v10 vfwsub.vv v8, v9, v10 vse64.v v11, (a0) vse64.v v12, (a1) vse64.v v8, (a2) ``` --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 386 ++++++++++-------- .../fixed-vectors-vfw-web-simplification.ll | 88 ++++ .../CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll | 8 +- .../CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll | 14 +- .../CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll | 14 +- 5 files changed, 307 insertions(+), 203 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c2fef4993f6ec..812bb26f201a0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13316,12 +13316,15 @@ namespace { // apply a combine. struct CombineResult; +enum ExtKind : uint8_t { ZExt = 1 << 0, SExt = 1 << 1, FPExt = 1 << 2 }; /// Helper class for folding sign/zero extensions. 
/// In particular, this class is used for the following combines: /// add | add_vl -> vwadd(u) | vwadd(u)_w /// sub | sub_vl -> vwsub(u) | vwsub(u)_w /// mul | mul_vl -> vwmul(u) | vwmul_su -/// +/// fadd -> vfwadd | vfwadd_w +/// fsub -> vfwsub | vfwsub_w +/// fmul -> vfwmul /// An object of this class represents an operand of the operation we want to /// combine. /// E.g., when trying to combine `mul_vl a, b`, we will have one instance of @@ -13335,7 +13338,8 @@ struct CombineResult; /// - VWADDU_W == add(op0, zext(op1)) /// - VWSUB_W == sub(op0, sext(op1)) /// - VWSUBU_W == sub(op0, zext(op1)) -/// +/// - VFWADD_W == fadd(op0, fpext(op1)) +/// - VFWSUB_W == fsub(op0, fpext(op1)) /// And VMV_V_X_VL, depending on the value, is conceptually equivalent to /// zext|sext(smaller_value). struct NodeExtensionHelper { @@ -13346,6 +13350,8 @@ struct NodeExtensionHelper { /// instance, a splat constant (e.g., 3), would support being both sign and /// zero extended. bool SupportsSExt; + /// Records if this operand is like being floating-Point extended. + bool SupportsFPExt; /// This boolean captures whether we care if this operand would still be /// around after the folding happens. bool EnforceOneUse; @@ -13369,6 +13375,7 @@ struct NodeExtensionHelper { case ISD::SIGN_EXTEND: case RISCVISD::VSEXT_VL: case RISCVISD::VZEXT_VL: + case RISCVISD::FP_EXTEND_VL: return OrigOperand.getOperand(0); default: return OrigOperand; @@ -13380,22 +13387,34 @@ struct NodeExtensionHelper { return OrigOperand.getOpcode() == RISCVISD::VMV_V_X_VL; } + /// Get the extended opcode. + unsigned getExtOpc(ExtKind SupportsExt) const { + switch (SupportsExt) { + case ExtKind::SExt: + return RISCVISD::VSEXT_VL; + case ExtKind::ZExt: + return RISCVISD::VZEXT_VL; + case ExtKind::FPExt: + return RISCVISD::FP_EXTEND_VL; + } + } + /// Get or create a value that can feed \p Root with the given extension \p - /// SExt. If \p SExt is std::nullopt, this returns the source of this operand. 
- /// \see ::getSource(). + /// SupportsExt. If \p SExt is std::nullopt, this returns the source of this + /// operand. \see ::getSource(). SDValue getOrCreateExtendedOp(SDNode *Root, SelectionDAG &DAG, const RISCVSubtarget &Subtarget, - std::optional SExt) const { - if (!SExt.has_value()) + std::optional SupportsExt) const { + if (!SupportsExt.has_value()) return OrigOperand; - MVT NarrowVT = getNarrowType(Root); + MVT NarrowVT = getNarrowType(Root, *SupportsExt); SDValue Source = getSource(); if (Source.getValueType() == NarrowVT) return Source; - unsigned ExtOpc = *SExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL; + unsigned ExtOpc = getExtOpc(*SupportsExt); // If we need an extension, we should be changing the type. SDLoc DL(OrigOperand); @@ -13405,6 +13424,7 @@ struct NodeExtensionHelper { case ISD::SIGN_EXTEND: case RISCVISD::VSEXT_VL: case RISCVISD::VZEXT_VL: + case RISCVISD::FP_EXTEND_VL: return DAG.getNode(ExtOpc, DL, NarrowVT, Source, Mask, VL); case RISCVISD::VMV_V_X_VL: return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, @@ -13420,41 +13440,79 @@ struct NodeExtensionHelper { /// Helper function to get the narrow type for \p Root. /// The narrow type is the type of \p Root where we divided the size of each /// element by 2. E.g., if Root's type <2xi16> -> narrow type <2xi8>. - /// \pre The size of the type of the elements of Root must be a multiple of 2 - /// and be greater than 16. - static MVT getNarrowType(const SDNode *Root) { + /// \pre Both the narrow type and the original type should be legal. + static MVT getNarrowType(const SDNode *Root, ExtKind SupportsExt) { MVT VT = Root->getSimpleValueType(0); // Determine the narrow size. unsigned NarrowSize = VT.getScalarSizeInBits() / 2; - assert(NarrowSize >= 8 && "Trying to extend something we can't represent"); - MVT NarrowVT = MVT::getVectorVT(MVT::getIntegerVT(NarrowSize), - VT.getVectorElementCount()); + + MVT EltVT = SupportsExt == ExtKind::FPExt + ? 
MVT::getFloatingPointVT(NarrowSize) + : MVT::getIntegerVT(NarrowSize); + + assert(NarrowSize >= (SupportsExt == ExtKind::FPExt ? 16 : 8) && + "Trying to extend something we can't represent"); + MVT NarrowVT = MVT::getVectorVT(EltVT, VT.getVectorElementCount()); return NarrowVT; } - /// Return the opcode required to materialize the folding of the sign - /// extensions (\p IsSExt == true) or zero extensions (IsSExt == false) for - /// both operands for \p Opcode. - /// Put differently, get the opcode to materialize: - /// - ISExt == true: \p Opcode(sext(a), sext(b)) -> newOpcode(a, b) - /// - ISExt == false: \p Opcode(zext(a), zext(b)) -> newOpcode(a, b) - /// \pre \p Opcode represents a supported root (\see ::isSupportedRoot()). - static unsigned getSameExtensionOpcode(unsigned Opcode, bool IsSExt) { + /// Get the opcode to materialize: + /// Opcode(sext(a), sext(b)) -> newOpcode(a, b) + static unsigned getSExtOpcode(unsigned Opcode) { switch (Opcode) { case ISD::ADD: case RISCVISD::ADD_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: - return IsSExt ? RISCVISD::VWADD_VL : RISCVISD::VWADDU_VL; + return RISCVISD::VWADD_VL; + case ISD::SUB: + case RISCVISD::SUB_VL: + case RISCVISD::VWSUB_W_VL: + case RISCVISD::VWSUBU_W_VL: + return RISCVISD::VWSUB_VL; case ISD::MUL: case RISCVISD::MUL_VL: - return IsSExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL; + return RISCVISD::VWMUL_VL; + default: + llvm_unreachable("Unexpected opcode"); + } + } + + /// Get the opcode to materialize: + /// Opcode(zext(a), zext(b)) -> newOpcode(a, b) + static unsigned getZExtOpcode(unsigned Opcode) { + switch (Opcode) { + case ISD::ADD: + case RISCVISD::ADD_VL: + case RISCVISD::VWADD_W_VL: + case RISCVISD::VWADDU_W_VL: + return RISCVISD::VWADDU_VL; case ISD::SUB: case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: - return IsSExt ? 
RISCVISD::VWSUB_VL : RISCVISD::VWSUBU_VL; + return RISCVISD::VWSUBU_VL; + case ISD::MUL: + case RISCVISD::MUL_VL: + return RISCVISD::VWMULU_VL; + default: + llvm_unreachable("Unexpected opcode"); + } + } + + /// Get the opcode to materialize: + /// Opcode(fpext(a), fpext(b)) -> newOpcode(a, b) + static unsigned getFPExtOpcode(unsigned Opcode) { + switch (Opcode) { + case RISCVISD::FADD_VL: + case RISCVISD::VFWADD_W_VL: + return RISCVISD::VFWADD_VL; + case RISCVISD::FSUB_VL: + case RISCVISD::VFWSUB_W_VL: + return RISCVISD::VFWSUB_VL; + case RISCVISD::FMUL_VL: + return RISCVISD::VFWMUL_VL; default: llvm_unreachable("Unexpected opcode"); } @@ -13468,16 +13526,22 @@ struct NodeExtensionHelper { return RISCVISD::VWMULSU_VL; } - /// Get the opcode to materialize \p Opcode(a, s|zext(b)) -> - /// newOpcode(a, b). - static unsigned getWOpcode(unsigned Opcode, bool IsSExt) { + /// Get the opcode to materialize + /// \p Opcode(a, s|z|fpext(b)) -> newOpcode(a, b). + static unsigned getWOpcode(unsigned Opcode, ExtKind SupportsExt) { switch (Opcode) { case ISD::ADD: case RISCVISD::ADD_VL: - return IsSExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL; + return SupportsExt == ExtKind::SExt ? RISCVISD::VWADD_W_VL + : RISCVISD::VWADDU_W_VL; case ISD::SUB: case RISCVISD::SUB_VL: - return IsSExt ? RISCVISD::VWSUB_W_VL : RISCVISD::VWSUBU_W_VL; + return SupportsExt == ExtKind::SExt ? 
RISCVISD::VWSUB_W_VL + : RISCVISD::VWSUBU_W_VL; + case RISCVISD::FADD_VL: + return RISCVISD::VFWADD_W_VL; + case RISCVISD::FSUB_VL: + return RISCVISD::VFWSUB_W_VL; default: llvm_unreachable("Unexpected opcode"); } @@ -13497,6 +13561,7 @@ struct NodeExtensionHelper { const RISCVSubtarget &Subtarget) { SupportsZExt = false; SupportsSExt = false; + SupportsFPExt = false; EnforceOneUse = true; CheckMask = true; unsigned Opc = OrigOperand.getOpcode(); @@ -13538,6 +13603,11 @@ struct NodeExtensionHelper { Mask = OrigOperand.getOperand(1); VL = OrigOperand.getOperand(2); break; + case RISCVISD::FP_EXTEND_VL: + SupportsFPExt = true; + Mask = OrigOperand.getOperand(1); + VL = OrigOperand.getOperand(2); + break; case RISCVISD::VMV_V_X_VL: { // Historically, we didn't care about splat values not disappearing during // combines. @@ -13584,15 +13654,16 @@ struct NodeExtensionHelper { /// Check if \p Root supports any extension folding combines. static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (Root->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(Root->getValueType(0))) return false; return Root->getValueType(0).isScalableVector(); } + // Vector Widening Integer Add/Sub/Mul Instructions case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: @@ -13600,7 +13671,13 @@ struct NodeExtensionHelper { case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: - return true; + // Vector Widening Floating-Point Add/Sub/Mul Instructions + case RISCVISD::FADD_VL: + case RISCVISD::FSUB_VL: + case RISCVISD::FMUL_VL: + case RISCVISD::VFWADD_W_VL: + case RISCVISD::VFWSUB_W_VL: + return TLI.isTypeLegal(Root->getValueType(0)); default: return false; } @@ -13616,16 +13693,23 @@ struct NodeExtensionHelper { unsigned Opc = Root->getOpcode(); switch (Opc) { - // We consider 
VW(U)_W(LHS, RHS) as if they were - // (LHS, S|ZEXT(RHS)) + // We consider + // VW_W(LHS, RHS) -> (LHS, SEXT(RHS)) + // VWU_W(LHS, RHS) -> (LHS, ZEXT(RHS)) + // VFW_W(LHS, RHS) -> F(LHS, FPEXT(RHS)) case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: + case RISCVISD::VFWADD_W_VL: + case RISCVISD::VFWSUB_W_VL: if (OperandIdx == 1) { SupportsZExt = Opc == RISCVISD::VWADDU_W_VL || Opc == RISCVISD::VWSUBU_W_VL; - SupportsSExt = !SupportsZExt; + SupportsSExt = + Opc == RISCVISD::VWADD_W_VL || Opc == RISCVISD::VWSUB_W_VL; + SupportsFPExt = + Opc == RISCVISD::VFWADD_W_VL || Opc == RISCVISD::VFWSUB_W_VL; std::tie(Mask, VL) = getMaskAndVL(Root, DAG, Subtarget); CheckMask = true; // There's no existing extension here, so we don't have to worry about @@ -13685,11 +13769,16 @@ struct NodeExtensionHelper { case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: + case RISCVISD::FADD_VL: + case RISCVISD::FMUL_VL: + case RISCVISD::VFWADD_W_VL: return true; case ISD::SUB: case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: + case RISCVISD::FSUB_VL: + case RISCVISD::VFWSUB_W_VL: return false; default: llvm_unreachable("Unexpected opcode"); @@ -13711,10 +13800,9 @@ struct NodeExtensionHelper { struct CombineResult { /// Opcode to be generated when materializing the combine. unsigned TargetOpcode; - // No value means no extension is needed. If extension is needed, the value - // indicates if it needs to be sign extended. - std::optional SExtLHS; - std::optional SExtRHS; + // No value means no extension is needed. + std::optional LHSExt; + std::optional RHSExt; /// Root of the combine. SDNode *Root; /// LHS of the TargetOpcode. 
@@ -13723,10 +13811,10 @@ struct CombineResult { NodeExtensionHelper RHS; CombineResult(unsigned TargetOpcode, SDNode *Root, - const NodeExtensionHelper &LHS, std::optional SExtLHS, - const NodeExtensionHelper &RHS, std::optional SExtRHS) - : TargetOpcode(TargetOpcode), SExtLHS(SExtLHS), SExtRHS(SExtRHS), - Root(Root), LHS(LHS), RHS(RHS) {} + const NodeExtensionHelper &LHS, std::optional LHSExt, + const NodeExtensionHelper &RHS, std::optional RHSExt) + : TargetOpcode(TargetOpcode), LHSExt(LHSExt), RHSExt(RHSExt), Root(Root), + LHS(LHS), RHS(RHS) {} /// Return a value that uses TargetOpcode and that can be used to replace /// Root. @@ -13747,8 +13835,8 @@ struct CombineResult { break; } return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0), - LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtLHS), - RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtRHS), + LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, LHSExt), + RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, RHSExt), Merge, Mask, VL); } }; @@ -13756,7 +13844,7 @@ struct CombineResult { /// Check if \p Root follows a pattern Root(ext(LHS), ext(RHS)) /// where `ext` is the same for both LHS and RHS (i.e., both are sext or both /// are zext) and LHS and RHS can be folded into Root. -/// AllowSExt and AllozZExt define which form `ext` can take in this pattern. +/// AllowExtMask define which form `ext` can take in this pattern. /// /// \note If the pattern can match with both zext and sext, the returned /// CombineResult will feature the zext result. @@ -13765,22 +13853,24 @@ struct CombineResult { /// can be used to apply the pattern. 
static std::optional canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, bool AllowSExt, - bool AllowZExt, SelectionDAG &DAG, + const NodeExtensionHelper &RHS, + uint8_t AllowExtMask, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - assert((AllowSExt || AllowZExt) && "Forgot to set what you want?"); if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) return std::nullopt; - if (AllowZExt && LHS.SupportsZExt && RHS.SupportsZExt) - return CombineResult(NodeExtensionHelper::getSameExtensionOpcode( - Root->getOpcode(), /*IsSExt=*/false), - Root, LHS, /*SExtLHS=*/false, RHS, /*SExtRHS=*/false); - if (AllowSExt && LHS.SupportsSExt && RHS.SupportsSExt) - return CombineResult(NodeExtensionHelper::getSameExtensionOpcode( - Root->getOpcode(), /*IsSExt=*/true), - Root, LHS, /*SExtLHS=*/true, RHS, - /*SExtRHS=*/true); + if ((AllowExtMask & ExtKind::ZExt) && LHS.SupportsZExt && RHS.SupportsZExt) + return CombineResult(NodeExtensionHelper::getZExtOpcode(Root->getOpcode()), + Root, LHS, /*LHSExt=*/{ExtKind::ZExt}, RHS, + /*RHSExt=*/{ExtKind::ZExt}); + if ((AllowExtMask & ExtKind::SExt) && LHS.SupportsSExt && RHS.SupportsSExt) + return CombineResult(NodeExtensionHelper::getSExtOpcode(Root->getOpcode()), + Root, LHS, /*LHSExt=*/{ExtKind::SExt}, RHS, + /*RHSExt=*/{ExtKind::SExt}); + if ((AllowExtMask & ExtKind::FPExt) && RHS.SupportsFPExt) + return CombineResult(NodeExtensionHelper::getFPExtOpcode(Root->getOpcode()), + Root, LHS, /*LHSExt=*/{ExtKind::FPExt}, RHS, + /*RHSExt=*/{ExtKind::FPExt}); return std::nullopt; } @@ -13794,8 +13884,9 @@ static std::optional canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS, const NodeExtensionHelper &RHS, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, - /*AllowZExt=*/true, DAG, Subtarget); + return 
canFoldToVWWithSameExtensionImpl( + Root, LHS, RHS, ExtKind::ZExt | ExtKind::SExt | ExtKind::FPExt, DAG, + Subtarget); } /// Check if \p Root follows a pattern Root(LHS, ext(RHS)) @@ -13809,18 +13900,23 @@ canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS, if (!RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) return std::nullopt; + if (RHS.SupportsFPExt) + return CombineResult( + NodeExtensionHelper::getWOpcode(Root->getOpcode(), ExtKind::FPExt), + Root, LHS, /*LHSExt=*/std::nullopt, RHS, /*RHSExt=*/{ExtKind::FPExt}); + // FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar // sext/zext? // Control this behavior behind an option (AllowSplatInVW_W) for testing // purposes. if (RHS.SupportsZExt && (!RHS.isSplat() || AllowSplatInVW_W)) return CombineResult( - NodeExtensionHelper::getWOpcode(Root->getOpcode(), /*IsSExt=*/false), - Root, LHS, /*SExtLHS=*/std::nullopt, RHS, /*SExtRHS=*/false); + NodeExtensionHelper::getWOpcode(Root->getOpcode(), ExtKind::ZExt), Root, + LHS, /*LHSExt=*/std::nullopt, RHS, /*RHSExt=*/{ExtKind::ZExt}); if (RHS.SupportsSExt && (!RHS.isSplat() || AllowSplatInVW_W)) return CombineResult( - NodeExtensionHelper::getWOpcode(Root->getOpcode(), /*IsSExt=*/true), - Root, LHS, /*SExtLHS=*/std::nullopt, RHS, /*SExtRHS=*/true); + NodeExtensionHelper::getWOpcode(Root->getOpcode(), ExtKind::SExt), Root, + LHS, /*LHSExt=*/std::nullopt, RHS, /*RHSExt=*/{ExtKind::SExt}); return std::nullopt; } @@ -13832,8 +13928,8 @@ static std::optional canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS, const NodeExtensionHelper &RHS, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, - /*AllowZExt=*/false, DAG, Subtarget); + return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, ExtKind::SExt, DAG, + Subtarget); } /// Check if \p Root follows a pattern Root(zext(LHS), zext(RHS)) @@ -13844,8 +13940,20 @@ static std::optional 
canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS, const NodeExtensionHelper &RHS, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/false, - /*AllowZExt=*/true, DAG, Subtarget); + return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, ExtKind::ZExt, DAG, + Subtarget); +} + +/// Check if \p Root follows a pattern Root(fpext(LHS), fpext(RHS)) +/// +/// \returns std::nullopt if the pattern doesn't match or a CombineResult that +/// can be used to apply the pattern. +static std::optional +canFoldToVWWithFPEXT(SDNode *Root, const NodeExtensionHelper &LHS, + const NodeExtensionHelper &RHS, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, ExtKind::FPExt, DAG, + Subtarget); } /// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS)) @@ -13863,7 +13971,8 @@ canFoldToVW_SU(SDNode *Root, const NodeExtensionHelper &LHS, !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) return std::nullopt; return CombineResult(NodeExtensionHelper::getSUOpcode(Root->getOpcode()), - Root, LHS, /*SExtLHS=*/true, RHS, /*SExtRHS=*/false); + Root, LHS, /*LHSExt=*/{ExtKind::SExt}, RHS, + /*RHSExt=*/{ExtKind::ZExt}); } SmallVector @@ -13874,11 +13983,16 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { case ISD::SUB: case RISCVISD::ADD_VL: case RISCVISD::SUB_VL: - // add|sub -> vwadd(u)|vwsub(u) + case RISCVISD::FADD_VL: + case RISCVISD::FSUB_VL: + // add|sub|fadd|fsub-> vwadd(u)|vwsub(u)|vfwadd|vfwsub Strategies.push_back(canFoldToVWWithSameExtension); - // add|sub -> vwadd(u)_w|vwsub(u)_w + // add|sub|fadd|fsub -> vwadd(u)_w|vwsub(u)_w}|vfwadd_w|vfwsub_w Strategies.push_back(canFoldToVW_W); break; + case RISCVISD::FMUL_VL: + Strategies.push_back(canFoldToVWWithSameExtension); + break; case ISD::MUL: case RISCVISD::MUL_VL: // mul -> vwmul(u) @@ -13896,6 +14010,11 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode 
*Root) { // vwaddu_w|vwsubu_w -> vwaddu|vwsubu Strategies.push_back(canFoldToVWWithZEXT); break; + case RISCVISD::VFWADD_W_VL: + case RISCVISD::VFWSUB_W_VL: + // vfwadd_w|vfwsub_w -> vfwadd|vfwsub + Strategies.push_back(canFoldToVWWithFPEXT); + break; default: llvm_unreachable("Unexpected opcode"); } @@ -13908,8 +14027,13 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { /// add_vl -> vwadd(u) | vwadd(u)_w /// sub_vl -> vwsub(u) | vwsub(u)_w /// mul_vl -> vwmul(u) | vwmul_su +/// fadd_vl -> vfwadd | vfwadd_w +/// fsub_vl -> vfwsub | vfwsub_w +/// fmul_vl -> vfwmul /// vwadd_w(u) -> vwadd(u) -/// vwub_w(u) -> vwadd(u) +/// vwsub_w(u) -> vwsub(u) +/// vfwadd_w -> vfwadd +/// vfwsub_w -> vfwsub static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { @@ -13965,9 +14089,9 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, // All the inputs that are extended need to be folded, otherwise // we would be leaving the old input (since it is may still be used), // and the new one. - if (Res->SExtLHS.has_value()) + if (Res->LHSExt.has_value()) AppendUsersIfNeeded(LHS); - if (Res->SExtRHS.has_value()) + if (Res->RHSExt.has_value()) AppendUsersIfNeeded(RHS); break; } @@ -14532,107 +14656,6 @@ static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG, N->getOperand(2), Mask, VL); } -static SDValue performVFMUL_VLCombine(SDNode *N, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - if (N->getValueType(0).isScalableVector() && - N->getValueType(0).getVectorElementType() == MVT::f32 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) { - return SDValue(); - } - - // FIXME: Ignore strict opcodes for now. - assert(!N->isTargetStrictFPOpcode() && "Unexpected opcode"); - - // Try to form widening multiply. 
- SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue Merge = N->getOperand(2); - SDValue Mask = N->getOperand(3); - SDValue VL = N->getOperand(4); - - if (Op0.getOpcode() != RISCVISD::FP_EXTEND_VL || - Op1.getOpcode() != RISCVISD::FP_EXTEND_VL) - return SDValue(); - - // TODO: Refactor to handle more complex cases similar to - // combineBinOp_VLToVWBinOp_VL. - if ((!Op0.hasOneUse() || !Op1.hasOneUse()) && - (Op0 != Op1 || !Op0->hasNUsesOfValue(2, 0))) - return SDValue(); - - // Check the mask and VL are the same. - if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL || - Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL) - return SDValue(); - - Op0 = Op0.getOperand(0); - Op1 = Op1.getOperand(0); - - return DAG.getNode(RISCVISD::VFWMUL_VL, SDLoc(N), N->getValueType(0), Op0, - Op1, Merge, Mask, VL); -} - -static SDValue performFADDSUB_VLCombine(SDNode *N, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - if (N->getValueType(0).isScalableVector() && - N->getValueType(0).getVectorElementType() == MVT::f32 && - (Subtarget.hasVInstructionsF16Minimal() && - !Subtarget.hasVInstructionsF16())) { - return SDValue(); - } - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue Merge = N->getOperand(2); - SDValue Mask = N->getOperand(3); - SDValue VL = N->getOperand(4); - - bool IsAdd = N->getOpcode() == RISCVISD::FADD_VL; - - // Look for foldable FP_EXTENDS. - bool Op0IsExtend = - Op0.getOpcode() == RISCVISD::FP_EXTEND_VL && - (Op0.hasOneUse() || (Op0 == Op1 && Op0->hasNUsesOfValue(2, 0))); - bool Op1IsExtend = - (Op0 == Op1 && Op0IsExtend) || - (Op1.getOpcode() == RISCVISD::FP_EXTEND_VL && Op1.hasOneUse()); - - // Check the mask and VL. - if (Op0IsExtend && (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL)) - Op0IsExtend = false; - if (Op1IsExtend && (Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)) - Op1IsExtend = false; - - // Canonicalize. 
- if (!Op1IsExtend) { - // Sub requires at least operand 1 to be an extend. - if (!IsAdd) - return SDValue(); - - // Add is commutable, if the other operand is foldable, swap them. - if (!Op0IsExtend) - return SDValue(); - - std::swap(Op0, Op1); - std::swap(Op0IsExtend, Op1IsExtend); - } - - // Op1 is a foldable extend. Op0 might be foldable. - Op1 = Op1.getOperand(0); - if (Op0IsExtend) - Op0 = Op0.getOperand(0); - - unsigned Opc; - if (IsAdd) - Opc = Op0IsExtend ? RISCVISD::VFWADD_VL : RISCVISD::VFWADD_W_VL; - else - Opc = Op0IsExtend ? RISCVISD::VFWSUB_VL : RISCVISD::VFWSUB_W_VL; - - return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op0, Op1, Merge, Mask, - VL); -} - static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { assert(N->getOpcode() == ISD::SRA && "Unexpected opcode"); @@ -16165,11 +16188,18 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case RISCVISD::STRICT_VFMSUB_VL: case RISCVISD::STRICT_VFNMSUB_VL: return performVFMADD_VLCombine(N, DAG, Subtarget); - case RISCVISD::FMUL_VL: - return performVFMUL_VLCombine(N, DAG, Subtarget); case RISCVISD::FADD_VL: case RISCVISD::FSUB_VL: - return performFADDSUB_VLCombine(N, DAG, Subtarget); + case RISCVISD::FMUL_VL: + case RISCVISD::VFWADD_W_VL: + case RISCVISD::VFWSUB_W_VL: { + if (N->getValueType(0).isScalableVector() && + N->getValueType(0).getVectorElementType() == MVT::f32 && + (Subtarget.hasVInstructionsF16Minimal() && + !Subtarget.hasVInstructionsF16())) + return SDValue(); + return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget); + } case ISD::LOAD: case ISD::STORE: { if (DCI.isAfterLegalizeDAG()) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll new file mode 100644 index 0000000000000..26f77225dbb0e --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN +; Check that the default value enables the web folding and +; that it is bigger than 3. +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING + +define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) { +; NO_FOLDING-LABEL: vfwmul_v2f116_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10 +; NO_FOLDING-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8 +; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9 +; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9 +; NO_FOLDING-NEXT: vse32.v v10, (a0) +; NO_FOLDING-NEXT: vse32.v v11, (a1) +; NO_FOLDING-NEXT: vse32.v v8, (a2) +; NO_FOLDING-NEXT: ret +; +; ZVFHMIN-LABEL: vfwmul_v2f116_multiple_users: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v10, v11, v8 +; ZVFHMIN-NEXT: vfadd.vv v11, v11, v9 +; ZVFHMIN-NEXT: vfsub.vv v8, v8, v9 +; 
ZVFHMIN-NEXT: vse32.v v10, (a0) +; ZVFHMIN-NEXT: vse32.v v11, (a1) +; ZVFHMIN-NEXT: vse32.v v8, (a2) +; ZVFHMIN-NEXT: ret + %c = fpext <2 x half> %a to <2 x float> + %d = fpext <2 x half> %b to <2 x float> + %d2 = fpext <2 x half> %b2 to <2 x float> + %e = fmul <2 x float> %c, %d + %f = fadd <2 x float> %c, %d2 + %g = fsub <2 x float> %d, %d2 + store <2 x float> %e, ptr %x + store <2 x float> %f, ptr %y + store <2 x float> %g, ptr %z + ret void +} + +define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) { +; NO_FOLDING-LABEL: vfwmul_v2f32_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10 +; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8 +; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9 +; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9 +; NO_FOLDING-NEXT: vse64.v v10, (a0) +; NO_FOLDING-NEXT: vse64.v v11, (a1) +; NO_FOLDING-NEXT: vse64.v v8, (a2) +; NO_FOLDING-NEXT: ret +; +; FOLDING-LABEL: vfwmul_v2f32_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; FOLDING-NEXT: vfwmul.vv v11, v8, v9 +; FOLDING-NEXT: vfwadd.vv v12, v8, v10 +; FOLDING-NEXT: vfwsub.vv v8, v9, v10 +; FOLDING-NEXT: vse64.v v11, (a0) +; FOLDING-NEXT: vse64.v v12, (a1) +; FOLDING-NEXT: vse64.v v8, (a2) +; FOLDING-NEXT: ret + %c = fpext <2 x float> %a to <2 x double> + %d = fpext <2 x float> %b to <2 x double> + %d2 = fpext <2 x float> %b2 to <2 x double> + %e = fmul <2 x double> %c, %d + %f = fadd <2 x double> %c, %d2 + %g = fsub <2 x double> %d, %d2 + store <2 x double> %e, ptr %x + store <2 x double> %f, ptr %y + store <2 x double> %g, ptr %z + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll index 
c9dc75e18774f..dd3a50cfd7737 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll @@ -396,12 +396,10 @@ define <32 x double> @vfwadd_vf_v32f32(ptr %x, float %y) { ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vslidedown.vi v8, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v8, v16 -; CHECK-NEXT: vfwadd.wv v16, v8, v0 -; CHECK-NEXT: vfwadd.wv v8, v8, v24 +; CHECK-NEXT: vfwadd.vf v16, v8, fa0 +; CHECK-NEXT: vfwadd.vf v8, v24, fa0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll index 8ad858d4c7659..7eaa1856ce221 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -394,18 +394,12 @@ define <32 x double> @vfwmul_vf_v32f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v16, 16 +; CHECK-NEXT: vslidedown.vi v8, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwcvt.f.f.v v24, v16 -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfmul.vv v16, v24, v0 -; CHECK-NEXT: vfmul.vv v8, v8, v0 +; CHECK-NEXT: vfwmul.vf v16, v8, fa0 +; CHECK-NEXT: vfwmul.vf v8, v24, fa0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float 
%y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll index d22781d6a97ac..8cf7c5f175865 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll @@ -394,18 +394,12 @@ define <32 x double> @vfwsub_vf_v32f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v16, 16 +; CHECK-NEXT: vslidedown.vi v8, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwcvt.f.f.v v24, v16 -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfsub.vv v16, v24, v0 -; CHECK-NEXT: vfsub.vv v8, v8, v0 +; CHECK-NEXT: vfwsub.vf v16, v8, fa0 +; CHECK-NEXT: vfwsub.vf v8, v24, fa0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 From 6e20cb5524034861d67a1d898907b4755b240f16 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Wed, 21 Feb 2024 13:06:18 +0000 Subject: [PATCH 088/351] [mlir] fix memory leak Fix a leak of the root operation not being deleted in the recently introduced transform_interpreter.c. 
--- mlir/test/CAPI/transform_interpreter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/test/CAPI/transform_interpreter.c b/mlir/test/CAPI/transform_interpreter.c index 8fe37b47b7f87..f1ab185e0e214 100644 --- a/mlir/test/CAPI/transform_interpreter.c +++ b/mlir/test/CAPI/transform_interpreter.c @@ -46,6 +46,7 @@ int testApplyNamedSequence(MlirContext ctx) { MlirLogicalResult result = mlirTransformApplyNamedSequence(root, entry, root, options); mlirTransformOptionsDestroy(options); + mlirOperationDestroy(root); if (mlirLogicalResultIsFailure(result)) return 2; From dd3e0a4643670f33850278ad281a358bbdd04e92 Mon Sep 17 00:00:00 2001 From: hev Date: Wed, 21 Feb 2024 21:15:17 +0800 Subject: [PATCH 089/351] [LoongArch] Assume no-op addrspacecasts by default (#82332) This PR indicates that `addrspacecasts` are always no-ops on LoongArch. Fixes #82330 --- .../Target/LoongArch/LoongArchTargetMachine.h | 5 ++ llvm/test/CodeGen/LoongArch/addrspacecast.ll | 47 +++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/addrspacecast.ll diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h index 7d39d47e86b36..fa9bc7608e7d2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.h @@ -45,6 +45,11 @@ class LoongArchTargetMachine : public LLVMTargetMachine { MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; + + // Addrspacecasts are always noops. 
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + return true; + } }; } // end namespace llvm diff --git a/llvm/test/CodeGen/LoongArch/addrspacecast.ll b/llvm/test/CodeGen/LoongArch/addrspacecast.ll new file mode 100644 index 0000000000000..7875562331be0 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/addrspacecast.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --verify-machineinstrs < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s | FileCheck %s --check-prefix=LA64 + +define void @cast0(ptr addrspace(1) %ptr) { +; LA32-LABEL: cast0: +; LA32: # %bb.0: +; LA32-NEXT: st.w $zero, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: cast0: +; LA64: # %bb.0: +; LA64-NEXT: st.w $zero, $a0, 0 +; LA64-NEXT: ret + %ptr0 = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(0) + store i32 0, ptr %ptr0 + ret void +} + +define void @cast1(ptr %ptr) { +; LA32-LABEL: cast1: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: bl %plt(foo) +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: cast1: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: bl %plt(foo) +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %castptr = addrspacecast ptr %ptr to ptr addrspace(10) + call void @foo(ptr addrspace(10) %castptr) + ret void +} + +declare void @foo(ptr addrspace(10)) From b5437c8ab2af277548ee59b6838e365d35a0d926 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 21 Feb 2024 14:15:39 +0100 Subject: [PATCH 090/351] [clang][Interp] 
Emit const references for Float arguments (#79753) The Float print type is backed by the Floating class, which in turn uses APFloat, which might heap-allocate memory, so might be expensive to copy. Add an 'AsRef' bit to the ArgType tablegen class, which defines whether we pass the argument around by copy or by reference. --- clang/lib/AST/Interp/Opcodes.td | 8 ++-- clang/utils/TableGen/ClangOpcodesEmitter.cpp | 50 ++++++++++++++++---- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index f1b08944a8812..5add723842d2b 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -35,7 +35,7 @@ def FnPtr : Type; // Types transferred to the interpreter. //===----------------------------------------------------------------------===// -class ArgType { string Name = ?; } +class ArgType { string Name = ?; bit AsRef = false; } def ArgSint8 : ArgType { let Name = "int8_t"; } def ArgUint8 : ArgType { let Name = "uint8_t"; } def ArgSint16 : ArgType { let Name = "int16_t"; } @@ -44,9 +44,9 @@ def ArgSint32 : ArgType { let Name = "int32_t"; } def ArgUint32 : ArgType { let Name = "uint32_t"; } def ArgSint64 : ArgType { let Name = "int64_t"; } def ArgUint64 : ArgType { let Name = "uint64_t"; } -def ArgFloat : ArgType { let Name = "Floating"; } -def ArgIntAP : ArgType { let Name = "IntegralAP"; } -def ArgIntAPS : ArgType { let Name = "IntegralAP"; } +def ArgIntAP : ArgType { let Name = "IntegralAP"; let AsRef = true; } +def ArgIntAPS : ArgType { let Name = "IntegralAP"; let AsRef = true; } +def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; } def ArgBool : ArgType { let Name = "bool"; } def ArgFunction : ArgType { let Name = "const Function *"; } diff --git a/clang/utils/TableGen/ClangOpcodesEmitter.cpp b/clang/utils/TableGen/ClangOpcodesEmitter.cpp index 02d5f9512d905..1c41301ab3aee 100644 --- a/clang/utils/TableGen/ClangOpcodesEmitter.cpp +++ 
b/clang/utils/TableGen/ClangOpcodesEmitter.cpp @@ -126,9 +126,15 @@ void ClangOpcodesEmitter::EmitInterp(raw_ostream &OS, StringRef N, // Emit calls to read arguments. for (size_t I = 0, N = Args.size(); I < N; ++I) { - OS << " auto V" << I; + const auto *Arg = Args[I]; + bool AsRef = Arg->getValueAsBit("AsRef"); + + if (AsRef) + OS << " const auto &V" << I; + else + OS << " const auto V" << I; OS << " = "; - OS << "ReadArg<" << Args[I]->getValueAsString("Name") + OS << "ReadArg<" << Arg->getValueAsString("Name") << ">(S, PC);\n"; } @@ -192,8 +198,14 @@ void ClangOpcodesEmitter::EmitEmitter(raw_ostream &OS, StringRef N, // Emit the list of arguments. OS << "bool ByteCodeEmitter::emit" << ID << "("; - for (size_t I = 0, N = Args.size(); I < N; ++I) - OS << Args[I]->getValueAsString("Name") << " A" << I << ", "; + for (size_t I = 0, N = Args.size(); I < N; ++I) { + const auto *Arg = Args[I]; + bool AsRef = Arg->getValueAsBit("AsRef"); + auto Name = Arg->getValueAsString("Name"); + + OS << (AsRef ? "const " : " ") << Name << " " << (AsRef ? "&" : "") << "A" + << I << ", "; + } OS << "const SourceInfo &L) {\n"; // Emit a call to write the opcodes. @@ -218,8 +230,14 @@ void ClangOpcodesEmitter::EmitProto(raw_ostream &OS, StringRef N, auto Args = R->getValueAsListOfDefs("Args"); Enumerate(R, N, [&OS, &Args](ArrayRef TS, const Twine &ID) { OS << "bool emit" << ID << "("; - for (auto *Arg : Args) - OS << Arg->getValueAsString("Name") << ", "; + for (size_t I = 0, N = Args.size(); I < N; ++I) { + const auto *Arg = Args[I]; + bool AsRef = Arg->getValueAsBit("AsRef"); + auto Name = Arg->getValueAsString("Name"); + + OS << (AsRef ? "const " : " ") << Name << " " << (AsRef ? 
"&" : "") + << ", "; + } OS << "const SourceInfo &);\n"; }); @@ -275,8 +293,14 @@ void ClangOpcodesEmitter::EmitGroup(raw_ostream &OS, StringRef N, OS << "::" << EmitFuncName << "("; for (size_t I = 0, N = Types->size(); I < N; ++I) OS << "PrimType T" << I << ", "; - for (size_t I = 0, N = Args.size(); I < N; ++I) - OS << Args[I]->getValueAsString("Name") << " A" << I << ", "; + for (size_t I = 0, N = Args.size(); I < N; ++I) { + const auto *Arg = Args[I]; + bool AsRef = Arg->getValueAsBit("AsRef"); + auto Name = Arg->getValueAsString("Name"); + + OS << (AsRef ? "const " : " ") << Name << " " << (AsRef ? "&" : "") << "A" + << I << ", "; + } OS << "const SourceInfo &I) {\n"; std::function Rec; @@ -343,8 +367,14 @@ void ClangOpcodesEmitter::EmitEval(raw_ostream &OS, StringRef N, auto Args = R->getValueAsListOfDefs("Args"); OS << "bool EvalEmitter::emit" << ID << "("; - for (size_t I = 0, N = Args.size(); I < N; ++I) - OS << Args[I]->getValueAsString("Name") << " A" << I << ", "; + for (size_t I = 0, N = Args.size(); I < N; ++I) { + const auto *Arg = Args[I]; + bool AsRef = Arg->getValueAsBit("AsRef"); + auto Name = Arg->getValueAsString("Name"); + + OS << (AsRef ? "const " : " ") << Name << " " + << (AsRef ? "&" : "") << "A" << I << ", "; + } OS << "const SourceInfo &L) {\n"; OS << " if (!isActive()) return true;\n"; OS << " CurrentSource = L;\n"; From 654e65d3b20835b4959d4d591e179814914ab5e2 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Wed, 21 Feb 2024 13:16:44 +0000 Subject: [PATCH 091/351] [mlir] Apply ClangTidy performance fix. Use const reference for loop variable. 
--- mlir/lib/Analysis/Presburger/Barvinok.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Analysis/Presburger/Barvinok.cpp b/mlir/lib/Analysis/Presburger/Barvinok.cpp index b6d1f99df8ba5..4be81c25951b8 100644 --- a/mlir/lib/Analysis/Presburger/Barvinok.cpp +++ b/mlir/lib/Analysis/Presburger/Barvinok.cpp @@ -425,7 +425,7 @@ mlir::presburger::detail::computePolytopeGeneratingFunction( // cones. GeneratingFunction vertexGf(numSymbols, {}, {}, {}); SmallVector, 4> unimodCones = {{1, tangentCone}}; - for (std::pair signedCone : unimodCones) { + for (const std::pair &signedCone : unimodCones) { auto [sign, cone] = signedCone; vertexGf = vertexGf + computeUnimodularConeGeneratingFunction(*vertex, sign, cone); @@ -785,4 +785,4 @@ mlir::presburger::detail::computeNumTerms(const GeneratingFunction &gf) { } return totalTerm.simplify(); -} \ No newline at end of file +} From 0fb3d4296f3a3ebe36661643155f4ee35a3167b7 Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Wed, 21 Feb 2024 13:24:45 +0000 Subject: [PATCH 092/351] [AArch64][GlobalISel] Refactor BITCAST Legalization (#80505) Ensure BITCAST is only legal for types with the same amount of bits. Enable BITCAST to work with non-legal vector types as well. 
--- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 5 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 54 +++ .../AArch64/GISel/AArch64LegalizerInfo.cpp | 15 +- .../GlobalISel/legalizer-info-validation.mir | 4 +- llvm/test/CodeGen/AArch64/bitcast.ll | 407 ++++++++++-------- 5 files changed, 307 insertions(+), 178 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index f001d8a167297..2beb9919418fc 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -338,6 +338,11 @@ class LegalizerHelper { unsigned TypeIdx, LLT NarrowTy); + // Fewer Elements for bitcast, ensuring that the size of the Src and Dst + // registers will be the same + LegalizeResult fewerElementsBitcast(MachineInstr &MI, unsigned TypeIdx, + LLT NarrowTy); + LegalizeResult fewerElementsVectorShuffle(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 044cd3d2d426e..30f12bf5cca58 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4677,11 +4677,44 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy); case G_FPOWI: return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/}); + case G_BITCAST: + return fewerElementsBitcast(MI, TypeIdx, NarrowTy); default: return UnableToLegalize; } } +LegalizerHelper::LegalizeResult +LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx, + LLT NarrowTy) { + assert(MI.getOpcode() == TargetOpcode::G_BITCAST && + "Not a bitcast operation"); + + if (TypeIdx != 0) + return UnableToLegalize; + + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); + + unsigned SrcScalSize = SrcTy.getScalarSizeInBits(); + LLT SrcNarrowTy = + 
LLT::fixed_vector(NarrowTy.getSizeInBits() / SrcScalSize, SrcScalSize); + + // Split the Src and Dst Reg into smaller registers + SmallVector SrcVRegs, BitcastVRegs; + if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy) + return UnableToLegalize; + + // Build new smaller bitcast instructions + // Not supporting Leftover types for now but will have to + for (unsigned i = 0; i < SrcVRegs.size(); i++) + BitcastVRegs.push_back( + MIRBuilder.buildBitcast(NarrowTy, SrcVRegs[i]).getReg(0)); + + MIRBuilder.buildMergeLikeInstr(DstReg, BitcastVRegs); + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle( MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) { assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); @@ -5366,6 +5399,27 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_BITCAST: { + if (TypeIdx != 0) + return UnableToLegalize; + + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + unsigned coefficient = SrcTy.getNumElements() * MoreTy.getNumElements(); + if (coefficient % DstTy.getNumElements() != 0) + return UnableToLegalize; + + coefficient = coefficient / DstTy.getNumElements(); + + LLT NewTy = SrcTy.changeElementCount( + ElementCount::get(coefficient, MoreTy.isScalable())); + Observer.changingInstr(MI); + moreElementsVectorSrc(MI, NewTy, 1); + moreElementsVectorDst(MI, MoreTy, 0); + Observer.changedInstr(MI); + return Legalized; + } default: return UnableToLegalize; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 261078cd4bd7d..60e046bc6cf40 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -740,12 +740,15 @@ 
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // Casts for 32 and 64-bit width type are just copies. // Same for 128-bit width type, except they are on the FPR bank. getActionDefinitionsBuilder(G_BITCAST) - // FIXME: This is wrong since G_BITCAST is not allowed to change the - // number of bits but it's what the previous code described and fixing - // it breaks tests. - .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, - v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, - v2p0}); + // Keeping 32-bit instructions legal to prevent regression in some tests + .legalForCartesianProduct({s32, v2s16, v4s8}) + .legalForCartesianProduct({s64, v8s8, v4s16, v2s32}) + .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0}) + .moreElementsToNextPow2(0) + .clampNumElements(0, v8s8, v16s8) + .clampNumElements(0, v4s16, v8s16) + .clampNumElements(0, v2s32, v4s32) + .lower(); getActionDefinitionsBuilder(G_VASTART).legalFor({p0}); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 381897b1835de..d87704cf45d5d 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -127,8 +127,8 @@ # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # # DEBUG-NEXT: G_BITCAST (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. the first uncovered type index: 2, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # # DEBUG-NEXT: G_FREEZE (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. 
opcode {{[0-9]+}} is aliased to {{[0-9]+}} diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index bac9b48a4087b..a5551285f2788 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -10,15 +10,6 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_i32_v2i16 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v2i16_v4i8 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v4i8_v2i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v4i64_v8i32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v4i64_v16i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v8i32_v4i64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v8i32_v16i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v8i64_v16i32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v16i16_v4i64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v16i16_v8i32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v16i32_v8i64 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for bitcast_v3i32_v6i16 define <4 x i16> @foo1(<2 x i32> %a) { ; CHECK-SD-LABEL: foo1: @@ -74,9 +65,9 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){ ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %c = add <4 x i8> %a, %b - %d = bitcast <4 x i8> %c to i32 - ret i32 %d + %c = add <4 x i8> %a, %b + %d = bitcast <4 x i8> %c to i32 + ret i32 %d } define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){ @@ -86,9 +77,9 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){ ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: ret - %c = add i32 %a, %b - %d = bitcast i32 %c to <4 x 
i8> - ret <4 x i8> %d + %c = add i32 %a, %b + %d = bitcast i32 %c to <4 x i8> + ret <4 x i8> %d } define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ @@ -104,9 +95,9 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ ; CHECK-NEXT: ldr w0, [sp, #12] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %c = add <2 x i16> %a, %b - %d = bitcast <2 x i16> %c to i32 - ret i32 %d + %c = add <2 x i16> %a, %b + %d = bitcast <2 x i16> %c to i32 + ret i32 %d } define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){ @@ -117,9 +108,9 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){ ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret - %c = add i32 %a, %b - %d = bitcast i32 %c to <2 x i16> - ret <2 x i16> %d + %c = add i32 %a, %b + %d = bitcast i32 %c to <2 x i16> + ret <2 x i16> %d } define i64 @bitcast_v8i8_i64(<8 x i8> %a, <8 x i8> %b){ @@ -128,9 +119,9 @@ define i64 @bitcast_v8i8_i64(<8 x i8> %a, <8 x i8> %b){ ; CHECK-NEXT: add v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %c = add <8 x i8> %a, %b - %d = bitcast <8 x i8> %c to i64 - ret i64 %d + %c = add <8 x i8> %a, %b + %d = bitcast <8 x i8> %c to i64 + ret i64 %d } define <8 x i8> @bitcast_i64_v8i8(i64 %a, i64 %b){ @@ -139,9 +130,9 @@ define <8 x i8> @bitcast_i64_v8i8(i64 %a, i64 %b){ ; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret - %c = add i64 %a, %b - %d = bitcast i64 %c to <8 x i8> - ret <8 x i8> %d + %c = add i64 %a, %b + %d = bitcast i64 %c to <8 x i8> + ret <8 x i8> %d } define i64 @bitcast_v4i16_i64(<4 x i16> %a, <4 x i16> %b){ @@ -150,9 +141,9 @@ define i64 @bitcast_v4i16_i64(<4 x i16> %a, <4 x i16> %b){ ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %c = add <4 x i16> %a, %b - %d = bitcast <4 x i16> %c to i64 - ret i64 %d + %c = add <4 x i16> %a, %b + %d = bitcast <4 x i16> %c to i64 + ret i64 %d } define <4 x i16> @bitcast_i64_v4i16(i64 %a, i64 %b){ @@ 
-161,9 +152,9 @@ define <4 x i16> @bitcast_i64_v4i16(i64 %a, i64 %b){ ; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret - %c = add i64 %a, %b - %d = bitcast i64 %c to <4 x i16> - ret <4 x i16> %d + %c = add i64 %a, %b + %d = bitcast i64 %c to <4 x i16> + ret <4 x i16> %d } define i64 @bitcast_v2i32_i64(<2 x i32> %a, <2 x i32> %b){ @@ -172,9 +163,9 @@ define i64 @bitcast_v2i32_i64(<2 x i32> %a, <2 x i32> %b){ ; CHECK-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %c = add <2 x i32> %a, %b - %d = bitcast <2 x i32> %c to i64 - ret i64 %d + %c = add <2 x i32> %a, %b + %d = bitcast <2 x i32> %c to i64 + ret i64 %d } define <2 x i32> @bitcast_i64_v2i32(i64 %a, i64 %b){ @@ -183,9 +174,9 @@ define <2 x i32> @bitcast_i64_v2i32(i64 %a, i64 %b){ ; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret - %c = add i64 %a, %b - %d = bitcast i64 %c to <2 x i32> - ret <2 x i32> %d + %c = add i64 %a, %b + %d = bitcast i64 %c to <2 x i32> + ret <2 x i32> %d } ; ===== Legal Vector Types ===== @@ -195,9 +186,9 @@ define <4 x i16> @bitcast_v2i32_v4i16(<2 x i32> %a, <2 x i32> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret - %c = add <2 x i32> %a, %b - %d = bitcast <2 x i32> %c to <4 x i16> - ret <4 x i16> %d + %c = add <2 x i32> %a, %b + %d = bitcast <2 x i32> %c to <4 x i16> + ret <4 x i16> %d } define <4 x i32> @bitcast_v2i64_v4i32(<2 x i64> %a, <2 x i64> %b){ @@ -205,9 +196,9 @@ define <4 x i32> @bitcast_v2i64_v4i32(<2 x i64> %a, <2 x i64> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret - %c = add <2 x i64> %a, %b - %d = bitcast <2 x i64> %c to <4 x i32> - ret <4 x i32> %d + %c = add <2 x i64> %a, %b + %d = bitcast <2 x i64> %c to <4 x i32> + ret <4 x i32> %d } define <8 x i8> @bitcast_v2i32_v8i8(<2 x i32> %a, <2 x i32> %b){ @@ -215,9 +206,9 @@ define <8 x i8> @bitcast_v2i32_v8i8(<2 x i32> %a, <2 x i32> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.2s, 
v0.2s, v1.2s ; CHECK-NEXT: ret - %c = add <2 x i32> %a, %b - %d = bitcast <2 x i32> %c to <8 x i8> - ret <8 x i8> %d + %c = add <2 x i32> %a, %b + %d = bitcast <2 x i32> %c to <8 x i8> + ret <8 x i8> %d } define <8 x i16> @bitcast_v2i64_v8i16(<2 x i64> %a, <2 x i64> %b){ @@ -225,9 +216,9 @@ define <8 x i16> @bitcast_v2i64_v8i16(<2 x i64> %a, <2 x i64> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret - %c = add <2 x i64> %a, %b - %d = bitcast <2 x i64> %c to <8 x i16> - ret <8 x i16> %d + %c = add <2 x i64> %a, %b + %d = bitcast <2 x i64> %c to <8 x i16> + ret <8 x i16> %d } define <16 x i8> @bitcast_v2i64_v16i8(<2 x i64> %a, <2 x i64> %b){ @@ -235,9 +226,9 @@ define <16 x i8> @bitcast_v2i64_v16i8(<2 x i64> %a, <2 x i64> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret - %c = add <2 x i64> %a, %b - %d = bitcast <2 x i64> %c to <16 x i8> - ret <16 x i8> %d + %c = add <2 x i64> %a, %b + %d = bitcast <2 x i64> %c to <16 x i8> + ret <16 x i8> %d } define <2 x i32> @bitcast_v4i16_v2i32(<4 x i16> %a, <4 x i16> %b){ @@ -245,9 +236,9 @@ define <2 x i32> @bitcast_v4i16_v2i32(<4 x i16> %a, <4 x i16> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret - %c = add <4 x i16> %a, %b - %d = bitcast <4 x i16> %c to <2 x i32> - ret <2 x i32> %d + %c = add <4 x i16> %a, %b + %d = bitcast <4 x i16> %c to <2 x i32> + ret <2 x i32> %d } define <2 x i64> @bitcast_v4i32_v2i64(<4 x i32> %a, <4 x i32> %b){ @@ -255,9 +246,9 @@ define <2 x i64> @bitcast_v4i32_v2i64(<4 x i32> %a, <4 x i32> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret - %c = add <4 x i32> %a, %b - %d = bitcast <4 x i32> %c to <2 x i64> - ret <2 x i64> %d + %c = add <4 x i32> %a, %b + %d = bitcast <4 x i32> %c to <2 x i64> + ret <2 x i64> %d } define <8 x i8> @bitcast_v4i16_v8i8(<4 x i16> %a, <4 x i16> %b){ @@ -265,9 +256,9 @@ define <8 x i8> @bitcast_v4i16_v8i8(<4 x i16> %a, <4 x i16> %b){ ; CHECK: // 
%bb.0: ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret - %c = add <4 x i16> %a, %b - %d = bitcast <4 x i16> %c to <8 x i8> - ret <8 x i8> %d + %c = add <4 x i16> %a, %b + %d = bitcast <4 x i16> %c to <8 x i8> + ret <8 x i8> %d } define <8 x i16> @bitcast_v4i32_v8i16(<4 x i32> %a, <4 x i32> %b){ @@ -275,9 +266,9 @@ define <8 x i16> @bitcast_v4i32_v8i16(<4 x i32> %a, <4 x i32> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret - %c = add <4 x i32> %a, %b - %d = bitcast <4 x i32> %c to <8 x i16> - ret <8 x i16> %d + %c = add <4 x i32> %a, %b + %d = bitcast <4 x i32> %c to <8 x i16> + ret <8 x i16> %d } define <16 x i8> @bitcast_v4i32_v16i8(<4 x i32> %a, <4 x i32> %b){ @@ -285,9 +276,9 @@ define <16 x i8> @bitcast_v4i32_v16i8(<4 x i32> %a, <4 x i32> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret - %c = add <4 x i32> %a, %b - %d = bitcast <4 x i32> %c to <16 x i8> - ret <16 x i8> %d + %c = add <4 x i32> %a, %b + %d = bitcast <4 x i32> %c to <16 x i8> + ret <16 x i8> %d } define <2 x i32> @bitcast_v8i8_v2i32(<8 x i8> %a, <8 x i8> %b){ @@ -295,9 +286,9 @@ define <2 x i32> @bitcast_v8i8_v2i32(<8 x i8> %a, <8 x i8> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %c = add <8 x i8> %a, %b - %d = bitcast <8 x i8> %c to <2 x i32> - ret <2 x i32> %d + %c = add <8 x i8> %a, %b + %d = bitcast <8 x i8> %c to <2 x i32> + ret <2 x i32> %d } define <2 x i64> @bitcast_v8i16_v2i64(<8 x i16> %a, <8 x i16> %b){ @@ -305,9 +296,9 @@ define <2 x i64> @bitcast_v8i16_v2i64(<8 x i16> %a, <8 x i16> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret - %c = add <8 x i16> %a, %b - %d = bitcast <8 x i16> %c to <2 x i64> - ret <2 x i64> %d + %c = add <8 x i16> %a, %b + %d = bitcast <8 x i16> %c to <2 x i64> + ret <2 x i64> %d } define <4 x i16> @bitcast_v8i8_v4i16(<8 x i8> %a, <8 x i8> %b){ @@ -315,9 +306,9 @@ define <4 x i16> @bitcast_v8i8_v4i16(<8 x i8> %a, <8 x i8> 
%b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret - %c = add <8 x i8> %a, %b - %d = bitcast <8 x i8> %c to <4 x i16> - ret <4 x i16> %d + %c = add <8 x i8> %a, %b + %d = bitcast <8 x i8> %c to <4 x i16> + ret <4 x i16> %d } define <4 x i32> @bitcast_v8i16_v4i32(<8 x i16> %a, <8 x i16> %b){ @@ -325,9 +316,9 @@ define <4 x i32> @bitcast_v8i16_v4i32(<8 x i16> %a, <8 x i16> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret - %c = add <8 x i16> %a, %b - %d = bitcast <8 x i16> %c to <4 x i32> - ret <4 x i32> %d + %c = add <8 x i16> %a, %b + %d = bitcast <8 x i16> %c to <4 x i32> + ret <4 x i32> %d } define <16 x i8> @bitcast_v8i16_v16i8(<8 x i16> %a, <8 x i16> %b){ @@ -335,9 +326,9 @@ define <16 x i8> @bitcast_v8i16_v16i8(<8 x i16> %a, <8 x i16> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret - %c = add <8 x i16> %a, %b - %d = bitcast <8 x i16> %c to <16 x i8> - ret <16 x i8> %d + %c = add <8 x i16> %a, %b + %d = bitcast <8 x i16> %c to <16 x i8> + ret <16 x i8> %d } define <2 x i64> @bitcast_v16i8_v2i64(<16 x i8> %a, <16 x i8> %b){ @@ -345,9 +336,9 @@ define <2 x i64> @bitcast_v16i8_v2i64(<16 x i8> %a, <16 x i8> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %c = add <16 x i8> %a, %b - %d = bitcast <16 x i8> %c to <2 x i64> - ret <2 x i64> %d + %c = add <16 x i8> %a, %b + %d = bitcast <16 x i8> %c to <2 x i64> + ret <2 x i64> %d } define <4 x i32> @bitcast_v16i8_v4i32(<16 x i8> %a, <16 x i8> %b){ @@ -355,9 +346,9 @@ define <4 x i32> @bitcast_v16i8_v4i32(<16 x i8> %a, <16 x i8> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %c = add <16 x i8> %a, %b - %d = bitcast <16 x i8> %c to <4 x i32> - ret <4 x i32> %d + %c = add <16 x i8> %a, %b + %d = bitcast <16 x i8> %c to <4 x i32> + ret <4 x i32> %d } define <8 x i16> @bitcast_v16i8_v8i16(<16 x i8> %a, <16 x i8> %b){ @@ -365,9 +356,9 @@ define <8 x i16> 
@bitcast_v16i8_v8i16(<16 x i8> %a, <16 x i8> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret - %c = add <16 x i8> %a, %b - %d = bitcast <16 x i8> %c to <8 x i16> - ret <8 x i16> %d + %c = add <16 x i8> %a, %b + %d = bitcast <16 x i8> %c to <8 x i16> + ret <8 x i16> %d } ; ===== Smaller/Larger Width Vectors with Legal Element Sizes ===== @@ -387,9 +378,9 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %c = add <2 x i16> %a, %b - %d = bitcast <2 x i16> %c to <4 x i8> - ret <4 x i8> %d + %c = add <2 x i16> %a, %b + %d = bitcast <2 x i16> %c to <4 x i8> + ret <4 x i8> %d } define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ @@ -407,101 +398,177 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %c = add <4 x i8> %a, %b - %d = bitcast <4 x i8> %c to <2 x i16> - ret <2 x i16> %d + %c = add <4 x i8> %a, %b + %d = bitcast <4 x i8> %c to <2 x i16> + ret <2 x i16> %d } define <8 x i32> @bitcast_v4i64_v8i32(<4 x i64> %a, <4 x i64> %b){ -; CHECK-LABEL: bitcast_v4i64_v8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: add v1.2d, v1.2d, v3.2d -; CHECK-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-NEXT: ret - %c = add <4 x i64> %a, %b - %d = bitcast <4 x i64> %c to <8 x i32> - ret <8 x i32> %d +; CHECK-SD-LABEL: bitcast_v4i64_v8i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v4i64_v8i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret + %c = add <4 x i64> %a, %b + %d = bitcast <4 x i64> %c to <8 x i32> + ret <8 x i32> %d } define <16 x i16> @bitcast_v4i64_v16i16(<4 x i64> %a, <4 x i64> %b){ -; CHECK-LABEL: 
bitcast_v4i64_v16i16: -; CHECK: // %bb.0: -; CHECK-NEXT: add v1.2d, v1.2d, v3.2d -; CHECK-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-NEXT: ret - %c = add <4 x i64> %a, %b - %d = bitcast <4 x i64> %c to <16 x i16> - ret <16 x i16> %d +; CHECK-SD-LABEL: bitcast_v4i64_v16i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v4i64_v16i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret + %c = add <4 x i64> %a, %b + %d = bitcast <4 x i64> %c to <16 x i16> + ret <16 x i16> %d } define <4 x i64> @bitcast_v8i32_v4i64(<8 x i32> %a, <8 x i32> %b){ -; CHECK-LABEL: bitcast_v8i32_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ret - %c = add <8 x i32> %a, %b - %d = bitcast <8 x i32> %c to <4 x i64> - ret <4 x i64> %d +; CHECK-SD-LABEL: bitcast_v8i32_v4i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v8i32_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: ret + %c = add <8 x i32> %a, %b + %d = bitcast <8 x i32> %c to <4 x i64> + ret <4 x i64> %d } define <16 x i16> @bitcast_v8i32_v16i16(<8 x i32> %a, <8 x i32> %b){ -; CHECK-LABEL: bitcast_v8i32_v16i16: -; CHECK: // %bb.0: -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ret - %c = add <8 x i32> %a, %b - %d = bitcast <8 x i32> %c to <16 x i16> - ret <16 x i16> %d +; CHECK-SD-LABEL: bitcast_v8i32_v16i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add 
v1.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v8i32_v16i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: ret + %c = add <8 x i32> %a, %b + %d = bitcast <8 x i32> %c to <16 x i16> + ret <16 x i16> %d } define <16 x i32> @bitcast_v8i64_v16i32(<8 x i64> %a, <8 x i64> %b){ -; CHECK-LABEL: bitcast_v8i64_v16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: add v2.2d, v2.2d, v6.2d -; CHECK-NEXT: add v0.2d, v0.2d, v4.2d -; CHECK-NEXT: add v1.2d, v1.2d, v5.2d -; CHECK-NEXT: add v3.2d, v3.2d, v7.2d -; CHECK-NEXT: ret - %c = add <8 x i64> %a, %b - %d = bitcast <8 x i64> %c to <16 x i32> - ret <16 x i32> %d +; CHECK-SD-LABEL: bitcast_v8i64_v16i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v2.2d, v2.2d, v6.2d +; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v5.2d +; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v8i64_v16i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.2d, v0.2d, v4.2d +; CHECK-GI-NEXT: add v1.2d, v1.2d, v5.2d +; CHECK-GI-NEXT: add v2.2d, v2.2d, v6.2d +; CHECK-GI-NEXT: add v3.2d, v3.2d, v7.2d +; CHECK-GI-NEXT: ret + %c = add <8 x i64> %a, %b + %d = bitcast <8 x i64> %c to <16 x i32> + ret <16 x i32> %d } define <4 x i64> @bitcast_v16i16_v4i64(<16 x i16> %a, <16 x i16> %b){ -; CHECK-LABEL: bitcast_v16i16_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: add v1.8h, v1.8h, v3.8h -; CHECK-NEXT: add v0.8h, v0.8h, v2.8h -; CHECK-NEXT: ret - %c = add <16 x i16> %a, %b - %d = bitcast <16 x i16> %c to <4 x i64> - ret <4 x i64> %d +; CHECK-SD-LABEL: bitcast_v16i16_v4i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v16i16_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: 
mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: ret + %c = add <16 x i16> %a, %b + %d = bitcast <16 x i16> %c to <4 x i64> + ret <4 x i64> %d } define <8 x i32> @bitcast_v16i16_v8i32(<16 x i16> %a, <16 x i16> %b){ -; CHECK-LABEL: bitcast_v16i16_v8i32: -; CHECK: // %bb.0: -; CHECK-NEXT: add v1.8h, v1.8h, v3.8h -; CHECK-NEXT: add v0.8h, v0.8h, v2.8h -; CHECK-NEXT: ret - %c = add <16 x i16> %a, %b - %d = bitcast <16 x i16> %c to <8 x i32> - ret <8 x i32> %d +; CHECK-SD-LABEL: bitcast_v16i16_v8i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v16i16_v8i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: add v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: ret + %c = add <16 x i16> %a, %b + %d = bitcast <16 x i16> %c to <8 x i32> + ret <8 x i32> %d } define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){ -; CHECK-LABEL: bitcast_v16i32_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: add v2.4s, v2.4s, v6.4s -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: add v3.4s, v3.4s, v7.4s -; CHECK-NEXT: ret - %c = add <16 x i32> %a, %b - %d = bitcast <16 x i32> %c to <8 x i64> - ret <8 x i64> %d +; CHECK-SD-LABEL: bitcast_v16i32_v8i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-SD-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-SD-NEXT: add v3.4s, v3.4s, v7.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bitcast_v16i32_v8i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-GI-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-GI-NEXT: add v3.4s, v3.4s, v7.4s +; CHECK-GI-NEXT: mov d4, v0.d[1] +; CHECK-GI-NEXT: mov d5, v1.d[1] +; 
CHECK-GI-NEXT: mov d6, v2.d[1] +; CHECK-GI-NEXT: mov d7, v3.d[1] +; CHECK-GI-NEXT: fmov x8, d4 +; CHECK-GI-NEXT: fmov x9, d5 +; CHECK-GI-NEXT: fmov x10, d6 +; CHECK-GI-NEXT: fmov x11, d7 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: mov v2.d[1], x10 +; CHECK-GI-NEXT: mov v3.d[1], x11 +; CHECK-GI-NEXT: ret + %c = add <16 x i32> %a, %b + %d = bitcast <16 x i32> %c to <8 x i64> + ret <8 x i64> %d } ; ===== Vectors with Non-Pow 2 Widths ===== @@ -511,7 +578,7 @@ define <6 x i16> @bitcast_v3i32_v6i16(<3 x i32> %a, <3 x i32> %b){ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret - %c = add <3 x i32> %a, %b - %d = bitcast <3 x i32> %c to <6 x i16> - ret <6 x i16> %d + %c = add <3 x i32> %a, %b + %d = bitcast <3 x i32> %c to <6 x i16> + ret <6 x i16> %d } From 69279a8413e08dd24168bad961975e79a50d9c19 Mon Sep 17 00:00:00 2001 From: Hui Date: Wed, 21 Feb 2024 13:43:35 +0000 Subject: [PATCH 093/351] [libc++][test] add benchmarks for `std::atomic::wait` (#70571) For the mutex vs atomic test: Old: `unique_lock` New: a lock implemented with `atomic::wait` On 10 years old Intel Macbook, `atomic::wait` is 50% slower than `mutex` ``` Benchmark Time CPU Time Old Time New CPU Old CPU New ---------------------------------------------------------------------------------------------------------------------------------- BM_multi_thread_lock_unlock/1024 +0.3735 +2.4497 1724726 2368935 153159 528354 BM_multi_thread_lock_unlock/2048 +0.4174 +1.2487 3410538 4834012 435062 978311 BM_multi_thread_lock_unlock/4096 +0.5256 +1.9824 6903783 10532681 590266 1760405 BM_multi_thread_lock_unlock/8192 +0.5415 +0.4578 14536391 22408399 1456328 2123075 BM_multi_thread_lock_unlock/16384 +0.5663 +0.0513 30181991 47275023 3316850 3486950 BM_multi_thread_lock_unlock/32768 +0.5635 -0.2081 62027663 96977726 6477076 5129190 BM_multi_thread_lock_unlock/65536 +0.5228 -0.3273 129637761 197408739 11341630 7628955 BM_multi_thread_lock_unlock/131072 
+0.4825 -0.1070 266256295 394712193 10379800 9269200 BM_multi_thread_lock_unlock/262144 +0.4793 +0.2795 539732340 798409253 10802200 13821100 BM_multi_thread_lock_unlock/524288 +0.5272 +0.2847 1070035132 1634124353 14523000 18657800 BM_multi_thread_lock_unlock/1048576 +0.4799 +0.3353 2125510441 3145636119 13404200 17899000 OVERALL_GEOMEAN +0.4970 +0.3886 0 0 0 0 ``` On Apple Arm, `atomic::wait` is 200% slower than `mutex`. And `atomic::wait` is even slower than my 10 years old Intel CPU Macbook ``` Benchmark Time CPU Time Old Time New CPU Old CPU New ---------------------------------------------------------------------------------------------------------------------------------- BM_multi_thread_lock_unlock/1024 +2.1811 +3.9854 2036726 6478993 119817 597334 BM_multi_thread_lock_unlock/2048 +1.6736 +1.4301 3162161 8454415 426201 1035727 BM_multi_thread_lock_unlock/4096 +1.1017 +0.6456 6620503 13914159 893019 1469578 BM_multi_thread_lock_unlock/8192 +0.6688 +0.2148 12089392 20174635 1489000 1808799 BM_multi_thread_lock_unlock/16384 +1.4217 -0.2436 19365999 46899345 2068266 1564530 BM_multi_thread_lock_unlock/32768 +2.6161 -0.4927 31371052 113440165 3715100 1884540 BM_multi_thread_lock_unlock/65536 +2.6286 -0.3967 54314581 197086847 5912764 3567410 BM_multi_thread_lock_unlock/131072 +2.3554 +0.4990 103176565 346201425 9260407 13880900 BM_multi_thread_lock_unlock/262144 +2.8780 +0.4995 182355400 707170733 16335852 24496000 BM_multi_thread_lock_unlock/524288 +3.0280 +0.3001 360953079 1453902595 32548700 42316364 BM_multi_thread_lock_unlock/1048576 +3.7480 +1.2374 714500462 3392470417 48603455 108747000 OVERALL_GEOMEAN +2.0791 +0.3874 0 0 0 0 ``` For the atomic_wait test: On my 2013 MacBook with Intel CPU ``` Run on (8 X 2300 MHz CPU s) CPU Caches: L1 Data 32 KiB (x4) L1 Instruction 32 KiB (x4) L2 Unified 256 KiB (x4) L3 Unified 6144 KiB (x1) Load Average: 1.95, 3.77, 4.13 
----------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------------------- BM_atomic_wait_one_thread_one_atomic_wait/1024 184455 ns 183979 ns 3760 BM_atomic_wait_one_thread_one_atomic_wait/2048 361607 ns 360917 ns 1912 BM_atomic_wait_one_thread_one_atomic_wait/4096 709055 ns 708326 ns 929 BM_atomic_wait_one_thread_one_atomic_wait/8192 1469063 ns 1467430 ns 488 BM_atomic_wait_one_thread_one_atomic_wait/16384 2865332 ns 2863473 ns 237 BM_atomic_wait_one_thread_one_atomic_wait/32768 5839429 ns 5834708 ns 113 BM_atomic_wait_one_thread_one_atomic_wait/65536 11460822 ns 11453183 ns 60 BM_atomic_wait_one_thread_one_atomic_wait/131072 23052804 ns 23035000 ns 30 BM_atomic_wait_one_thread_one_atomic_wait/262144 46958743 ns 46712733 ns 15 BM_atomic_wait_one_thread_one_atomic_wait/524288 93151904 ns 92977429 ns 7 BM_atomic_wait_one_thread_one_atomic_wait/1048576 186100011 ns 185888500 ns 4 BM_atomic_wait_one_thread_one_atomic_wait/2097152 364548135 ns 364280000 ns 2 BM_atomic_wait_one_thread_one_atomic_wait/4194304 747181672 ns 745056000 ns 1 BM_atomic_wait_one_thread_one_atomic_wait/8388608 1473070400 ns 1471165000 ns 1 BM_atomic_wait_one_thread_one_atomic_wait/16777216 2950352547 ns 2947373000 ns 1 BM_atomic_wait_multi_thread_one_atomic_wait/1024 668544 ns 167233 ns 4496 BM_atomic_wait_multi_thread_one_atomic_wait/2048 1384668 ns 369750 ns 1941 BM_atomic_wait_multi_thread_one_atomic_wait/4096 2851627 ns 768559 ns 995 BM_atomic_wait_multi_thread_one_atomic_wait/8192 5797669 ns 1476876 ns 526 BM_atomic_wait_multi_thread_one_atomic_wait/16384 11597952 ns 2692792 ns 260 BM_atomic_wait_multi_thread_one_atomic_wait/32768 23528028 ns 5291465 ns 142 BM_atomic_wait_multi_thread_one_atomic_wait/65536 46287247 ns 8547713 ns 87 BM_atomic_wait_multi_thread_one_atomic_wait/131072 90315848 ns 13294492 ns 61 
BM_atomic_wait_multi_thread_one_atomic_wait/262144 190722393 ns 16193917 ns 36 BM_atomic_wait_multi_thread_one_atomic_wait/524288 408456684 ns 23641600 ns 10 BM_atomic_wait_multi_thread_one_atomic_wait/1048576 708809670 ns 36361900 ns 10 BM_atomic_wait_multi_thread_wait_different_atomics/1024 2116444 ns 11669 ns 10000 BM_atomic_wait_multi_thread_wait_different_atomics/2048 12435259 ns 21905 ns 1000 BM_atomic_wait_multi_thread_wait_different_atomics/4096 6393816 ns 17819 ns 1000 BM_atomic_wait_multi_thread_wait_different_atomics/8192 11930400 ns 28637 ns 1000 BM_atomic_wait_multi_thread_wait_different_atomics/16384 20987224 ns 35272 ns 1000 BM_atomic_wait_multi_thread_wait_different_atomics/32768 44335820 ns 66660 ns 100 BM_atomic_wait_multi_thread_wait_different_atomics/65536 91395912 ns 129030 ns 100 BM_atomic_wait_multi_thread_wait_different_atomics/131072 145440007 ns 165960 ns 100 BM_atomic_wait_multi_thread_wait_different_atomics/262144 368219935 ns 420800 ns 10 BM_atomic_wait_multi_thread_wait_different_atomics/524288 630106863 ns 809500 ns 10 BM_atomic_wait_multi_thread_wait_different_atomics/1048576 1138174673 ns 1093000 ns 10 ``` On apple arm ``` Run on (8 X 24.1208 MHz CPU s) CPU Caches: L1 Data 64 KiB (x8) L1 Instruction 128 KiB (x8) L2 Unified 4096 KiB (x2) Load Average: 1.34, 1.58, 1.66 ----------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------------------- BM_atomic_wait_one_thread_one_atomic_wait/1024 61602 ns 61602 ns 8701 BM_atomic_wait_one_thread_one_atomic_wait/2048 123148 ns 123146 ns 5688 BM_atomic_wait_one_thread_one_atomic_wait/4096 246248 ns 246249 ns 2888 BM_atomic_wait_one_thread_one_atomic_wait/8192 480373 ns 480359 ns 1455 BM_atomic_wait_one_thread_one_atomic_wait/16384 974725 ns 974721 ns 724 BM_atomic_wait_one_thread_one_atomic_wait/32768 1922185 ns 1922115 ns 355 
BM_atomic_wait_one_thread_one_atomic_wait/65536 3940632 ns 3940608 ns 181 BM_atomic_wait_one_thread_one_atomic_wait/131072 7886302 ns 7886102 ns 88 BM_atomic_wait_one_thread_one_atomic_wait/262144 15393156 ns 15393000 ns 45 BM_atomic_wait_one_thread_one_atomic_wait/524288 30833221 ns 30832174 ns 23 BM_atomic_wait_one_thread_one_atomic_wait/1048576 62551936 ns 62551909 ns 11 BM_atomic_wait_one_thread_one_atomic_wait/2097152 123155625 ns 123155667 ns 6 BM_atomic_wait_one_thread_one_atomic_wait/4194304 252468180 ns 252458667 ns 3 BM_atomic_wait_one_thread_one_atomic_wait/8388608 505075604 ns 505075500 ns 2 BM_atomic_wait_one_thread_one_atomic_wait/16777216 992977209 ns 992935000 ns 1 BM_atomic_wait_multi_thread_one_atomic_wait/1024 531411 ns 239695 ns 2783 BM_atomic_wait_multi_thread_one_atomic_wait/2048 1030592 ns 484868 ns 1413 BM_atomic_wait_multi_thread_one_atomic_wait/4096 1951896 ns 922357 ns 631 BM_atomic_wait_multi_thread_one_atomic_wait/8192 3759893 ns 1952074 ns 390 BM_atomic_wait_multi_thread_one_atomic_wait/16384 7417929 ns 3458309 ns 233 BM_atomic_wait_multi_thread_one_atomic_wait/32768 14386361 ns 5590830 ns 100 BM_atomic_wait_multi_thread_one_atomic_wait/65536 29725536 ns 6521887 ns 115 BM_atomic_wait_multi_thread_one_atomic_wait/131072 60023797 ns 10766795 ns 73 BM_atomic_wait_multi_thread_one_atomic_wait/262144 120782267 ns 17532091 ns 44 BM_atomic_wait_multi_thread_one_atomic_wait/524288 242539333 ns 27506920 ns 25 BM_atomic_wait_multi_thread_one_atomic_wait/1048576 482833787 ns 53721600 ns 10 BM_atomic_wait_multi_thread_wait_different_atomics/1024 2230048 ns 626042 ns 1000 BM_atomic_wait_multi_thread_wait_different_atomics/2048 3931958 ns 837540 ns 884 BM_atomic_wait_multi_thread_wait_different_atomics/4096 6506887 ns 1127922 ns 586 BM_atomic_wait_multi_thread_wait_different_atomics/8192 10528008 ns 1651254 ns 456 BM_atomic_wait_multi_thread_wait_different_atomics/16384 18055829 ns 2066379 ns 317 
BM_atomic_wait_multi_thread_wait_different_atomics/32768 29878496 ns 2875600 ns 100 BM_atomic_wait_multi_thread_wait_different_atomics/65536 50523799 ns 3193170 ns 100 BM_atomic_wait_multi_thread_wait_different_atomics/131072 85926943 ns 4121950 ns 100 BM_atomic_wait_multi_thread_wait_different_atomics/262144 154602296 ns 5879050 ns 100 BM_atomic_wait_multi_thread_wait_different_atomics/524288 279121754 ns 10063400 ns 10 BM_atomic_wait_multi_thread_wait_different_atomics/1048576 522796900 ns 12370300 ns 10 ``` --- libcxx/benchmarks/CMakeLists.txt | 2 + libcxx/benchmarks/atomic_wait.bench.cpp | 154 ++++++++++++++++++ .../atomic_wait_vs_mutex_lock.bench.cpp | 109 +++++++++++++ 3 files changed, 265 insertions(+) create mode 100644 libcxx/benchmarks/atomic_wait.bench.cpp create mode 100644 libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt index 2434d82c6fd6b..b436e96f178b7 100644 --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -197,6 +197,8 @@ set(BENCHMARK_TESTS algorithms/sort.bench.cpp algorithms/sort_heap.bench.cpp algorithms/stable_sort.bench.cpp + atomic_wait.bench.cpp + atomic_wait_vs_mutex_lock.bench.cpp libcxxabi/dynamic_cast.bench.cpp libcxxabi/dynamic_cast_old_stress.bench.cpp allocation.bench.cpp diff --git a/libcxx/benchmarks/atomic_wait.bench.cpp b/libcxx/benchmarks/atomic_wait.bench.cpp new file mode 100644 index 0000000000000..4a06a45739377 --- /dev/null +++ b/libcxx/benchmarks/atomic_wait.bench.cpp @@ -0,0 +1,154 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "make_test_thread.h" + +using namespace std::chrono_literals; + +void BM_atomic_wait_one_thread_one_atomic_wait(benchmark::State& state) { + std::atomic a; + auto thread_func = [&](std::stop_token st) { + while (!st.stop_requested()) { + a.fetch_add(1, std::memory_order_relaxed); + a.notify_all(); + } + }; + + std::uint64_t total_loop_test_param = state.range(0); + + auto thread = support::make_test_jthread(thread_func); + + for (auto _ : state) { + for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { + auto old = a.load(std::memory_order_relaxed); + a.wait(old); + } + } +} +BENCHMARK(BM_atomic_wait_one_thread_one_atomic_wait)->RangeMultiplier(2)->Range(1 << 10, 1 << 24); + +void BM_atomic_wait_multi_thread_one_atomic_wait(benchmark::State& state) { + std::atomic a; + auto notify_func = [&](std::stop_token st) { + while (!st.stop_requested()) { + a.fetch_add(1, std::memory_order_relaxed); + a.notify_all(); + } + }; + + std::uint64_t total_loop_test_param = state.range(0); + constexpr auto num_waiting_threads = 15; + std::vector wait_threads; + wait_threads.reserve(num_waiting_threads); + + auto notify_thread = support::make_test_jthread(notify_func); + + std::atomic start_flag = 0; + std::atomic done_count = 0; + auto wait_func = [&a, &start_flag, &done_count, total_loop_test_param](std::stop_token st) { + auto old_start = 0; + while (!st.stop_requested()) { + start_flag.wait(old_start); + old_start = start_flag.load(); + for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { + auto old = a.load(std::memory_order_relaxed); + a.wait(old); + } + done_count.fetch_add(1); + } + }; + + for (size_t i = 0; i < num_waiting_threads; ++i) { + wait_threads.emplace_back(support::make_test_jthread(wait_func)); + } + + for (auto _ : state) { + 
done_count = 0; + start_flag.fetch_add(1); + start_flag.notify_all(); + while (done_count < num_waiting_threads) { + std::this_thread::yield(); + } + } + for (auto& t : wait_threads) { + t.request_stop(); + } + start_flag.fetch_add(1); + start_flag.notify_all(); + for (auto& t : wait_threads) { + t.join(); + } +} +BENCHMARK(BM_atomic_wait_multi_thread_one_atomic_wait)->RangeMultiplier(2)->Range(1 << 10, 1 << 20); + +void BM_atomic_wait_multi_thread_wait_different_atomics(benchmark::State& state) { + const std::uint64_t total_loop_test_param = state.range(0); + constexpr std::uint64_t num_atomics = 7; + std::vector> atomics(num_atomics); + + auto notify_func = [&](std::stop_token st, size_t idx) { + while (!st.stop_requested()) { + atomics[idx].fetch_add(1, std::memory_order_relaxed); + atomics[idx].notify_all(); + } + }; + + std::atomic start_flag = 0; + std::atomic done_count = 0; + + auto wait_func = [&, total_loop_test_param](std::stop_token st, size_t idx) { + auto old_start = 0; + while (!st.stop_requested()) { + start_flag.wait(old_start); + old_start = start_flag.load(); + for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { + auto old = atomics[idx].load(std::memory_order_relaxed); + atomics[idx].wait(old); + } + done_count.fetch_add(1); + } + }; + + std::vector notify_threads; + notify_threads.reserve(num_atomics); + + std::vector wait_threads; + wait_threads.reserve(num_atomics); + + for (size_t i = 0; i < num_atomics; ++i) { + notify_threads.emplace_back(support::make_test_jthread(notify_func, i)); + } + + for (size_t i = 0; i < num_atomics; ++i) { + wait_threads.emplace_back(support::make_test_jthread(wait_func, i)); + } + + for (auto _ : state) { + done_count = 0; + start_flag.fetch_add(1); + start_flag.notify_all(); + while (done_count < num_atomics) { + std::this_thread::yield(); + } + } + for (auto& t : wait_threads) { + t.request_stop(); + } + start_flag.fetch_add(1); + start_flag.notify_all(); + for (auto& t : wait_threads) { + t.join(); + 
} +} +BENCHMARK(BM_atomic_wait_multi_thread_wait_different_atomics)->RangeMultiplier(2)->Range(1 << 10, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp b/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp new file mode 100644 index 0000000000000..c60fcd579488c --- /dev/null +++ b/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp @@ -0,0 +1,109 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// To run this test, build libcxx and cxx-benchmarks targets +// cd third-party/benchmark/tools +// ./compare.py filters ../../../build/libcxx/benchmarks/atomic_wait_vs_mutex_lock.libcxx.out BM_atomic_wait BM_mutex + +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "make_test_thread.h" + +using namespace std::chrono_literals; + +struct AtomicLock { + std::atomic& locked_; + + AtomicLock(const AtomicLock&) = delete; + AtomicLock& operator=(const AtomicLock&) = delete; + + AtomicLock(std::atomic& l) : locked_(l) { lock(); } + ~AtomicLock() { unlock(); } + + void lock() { + while (true) { + locked_.wait(true, std::memory_order_relaxed); + bool expected = false; + if (locked_.compare_exchange_weak(expected, true, std::memory_order_acquire, std::memory_order_relaxed)) + break; + } + } + + void unlock() { + locked_.store(false, std::memory_order_release); + locked_.notify_all(); + } +}; + +// using LockState = std::atomic; +// using Lock = AtomicLock; + +// using LockState = std::mutex; +// using Lock = std::unique_lock; + +template +void test_multi_thread_lock_unlock(benchmark::State& state) { + std::uint64_t total_loop_test_param = state.range(0); + constexpr auto 
num_threads = 15; + std::vector threads; + threads.reserve(num_threads); + + std::atomic start_flag = 0; + std::atomic done_count = 0; + + LockState lock_state{}; + + auto func = [&start_flag, &done_count, &lock_state, total_loop_test_param](std::stop_token st) { + auto old_start = 0; + while (!st.stop_requested()) { + start_flag.wait(old_start); + old_start = start_flag.load(); + + // main things under test: locking and unlocking in the loop + for (std::uint64_t i = 0; i < total_loop_test_param; ++i) { + Lock l{lock_state}; + } + + done_count.fetch_add(1); + } + }; + + for (size_t i = 0; i < num_threads; ++i) { + threads.emplace_back(support::make_test_jthread(func)); + } + + for (auto _ : state) { + done_count = 0; + start_flag.fetch_add(1); + start_flag.notify_all(); + while (done_count < num_threads) { + std::this_thread::yield(); + } + } + for (auto& t : threads) { + t.request_stop(); + } + start_flag.fetch_add(1); + start_flag.notify_all(); + for (auto& t : threads) { + t.join(); + } +} + +void BM_atomic_wait(benchmark::State& state) { test_multi_thread_lock_unlock, AtomicLock>(state); } +BENCHMARK(BM_atomic_wait)->RangeMultiplier(2)->Range(1 << 10, 1 << 20); + +void BM_mutex(benchmark::State& state) { + test_multi_thread_lock_unlock>(state); +} +BENCHMARK(BM_mutex)->RangeMultiplier(2)->Range(1 << 10, 1 << 20); + +BENCHMARK_MAIN(); From 98a07f72eefb43476ca9e7af3178879d6ef71464 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 12:02:06 +0000 Subject: [PATCH 094/351] [X86] LowerCTPOP - "ctpop(i2 x) --> sub(x, (x >> 1))" If we only have 2 active bits then we can avoid the i8 CTPOP multiply expansion entirely Another expansion pulled from #79823 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 12 +++++++ llvm/test/CodeGen/X86/ctpop-mask.ll | 44 +++++++++++-------------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ac2d1c76980ad..834b470a4a867 
100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31053,6 +31053,18 @@ static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget, unsigned ActiveBits = Known.getBitWidth() - LZ; unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ); + // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))". + if (ShiftedActiveBits <= 2) { + if (ActiveBits > 2) + Op = DAG.getNode(ISD::SRL, DL, VT, Op, + DAG.getShiftAmountConstant(TZ, VT, DL)); + Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32); + Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op, + DAG.getNode(ISD::SRL, DL, MVT::i32, Op, + DAG.getShiftAmountConstant(1, VT, DL))); + return DAG.getZExtOrTrunc(Op, DL, VT); + } + // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply. if (ShiftedActiveBits <= 8) { SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32); diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll index 602d9b511cdc0..4b03563fd9924 100644 --- a/llvm/test/CodeGen/X86/ctpop-mask.ll +++ b/llvm/test/CodeGen/X86/ctpop-mask.ll @@ -33,22 +33,19 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone { ; X86-NO-POPCOUNT: # %bb.0: ; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-POPCOUNT-NEXT: andl $3, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx +; X86-NO-POPCOUNT-NEXT: shrl %ecx +; X86-NO-POPCOUNT-NEXT: subl %ecx, %eax ; X86-NO-POPCOUNT-NEXT: xorl %edx, %edx ; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask2: ; X64-NO-POPCOUNT: # %bb.0: -; X64-NO-POPCOUNT-NEXT: andl $3, %edi -; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201 -; X64-NO-POPCOUNT-NEXT: shrl $3, %eax -; X64-NO-POPCOUNT-NEXT: 
andl $17895697, %eax # imm = 0x1111111 -; X64-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X64-NO-POPCOUNT-NEXT: shrl $28, %eax +; X64-NO-POPCOUNT-NEXT: movq %rdi, %rax +; X64-NO-POPCOUNT-NEXT: andl $3, %eax +; X64-NO-POPCOUNT-NEXT: movl %eax, %ecx +; X64-NO-POPCOUNT-NEXT: shrl %ecx +; X64-NO-POPCOUNT-NEXT: subl %ecx, %eax ; X64-NO-POPCOUNT-NEXT: retq %mask = and i64 %x, 3 %count = tail call i64 @llvm.ctpop.i64(i64 %mask) @@ -71,25 +68,22 @@ define i32 @ctpop_shifted_mask2(i32 %x) nounwind readnone { ; ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask2: ; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: movl $1572864, %eax # imm = 0x180000 +; X86-NO-POPCOUNT-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NO-POPCOUNT-NEXT: movl %eax, %ecx +; X86-NO-POPCOUNT-NEXT: shrl $20, %ecx ; X86-NO-POPCOUNT-NEXT: shrl $19, %eax -; X86-NO-POPCOUNT-NEXT: andl $3, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: subl %ecx, %eax ; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask2: ; X64-NO-POPCOUNT: # %bb.0: -; X64-NO-POPCOUNT-NEXT: shrl $19, %edi -; X64-NO-POPCOUNT-NEXT: andl $3, %edi -; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201 -; X64-NO-POPCOUNT-NEXT: shrl $3, %eax -; X64-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X64-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X64-NO-POPCOUNT-NEXT: shrl $28, %eax +; X64-NO-POPCOUNT-NEXT: movl %edi, %eax +; X64-NO-POPCOUNT-NEXT: andl $1572864, %eax # imm = 0x180000 +; X64-NO-POPCOUNT-NEXT: movl %eax, %ecx +; X64-NO-POPCOUNT-NEXT: shrl $20, %ecx +; X64-NO-POPCOUNT-NEXT: shrl $19, %eax +; X64-NO-POPCOUNT-NEXT: subl %ecx, %eax 
; X64-NO-POPCOUNT-NEXT: retq %mask = and i32 %x, 1572864 ; 3 << 19 %count = tail call i32 @llvm.ctpop.i32(i32 %mask) From b8c9b0613465b2770d2ae7f61364ddce6bba4511 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 13:38:30 +0000 Subject: [PATCH 095/351] [X86] LowerCTPOP - add i3 and i4 LUT 'shift+mask' expansions Use the 3 or 4 active bits as a shift amount into a i32/i64 constant representing the number of set bits. In future, it might be worthwhile to move this into a generic location in case other targets want to make use of them. Another expansion pulled from #79823 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 32 +++++++++++ llvm/test/CodeGen/X86/ctpop-mask.ll | 75 +++++++++++++------------ 2 files changed, 70 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 834b470a4a867..a86f13135173b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31065,6 +31065,38 @@ static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget, return DAG.getZExtOrTrunc(Op, DL, VT); } + // i3 CTPOP - perform LUT into i32 integer. + if (ShiftedActiveBits <= 3) { + if (ActiveBits > 3) + Op = DAG.getNode(ISD::SRL, DL, VT, Op, + DAG.getShiftAmountConstant(TZ, VT, DL)); + Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32); + Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op, + DAG.getShiftAmountConstant(1, VT, DL)); + Op = DAG.getNode(ISD::SRL, DL, MVT::i32, + DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op); + Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, + DAG.getConstant(0x3, DL, MVT::i32)); + return DAG.getZExtOrTrunc(Op, DL, VT); + } + + // i4 CTPOP - perform LUT into i64 integer. 
+ if (ShiftedActiveBits <= 4 && + DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) { + SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64); + if (ActiveBits > 4) + Op = DAG.getNode(ISD::SRL, DL, VT, Op, + DAG.getShiftAmountConstant(TZ, VT, DL)); + Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32); + Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, + DAG.getConstant(4, DL, MVT::i32)); + Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT, + DAG.getShiftAmountOperand(MVT::i64, Op)); + Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op, + DAG.getConstant(0x7, DL, MVT::i64)); + return DAG.getZExtOrTrunc(Op, DL, VT); + } + // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply. if (ShiftedActiveBits <= 8) { SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32); diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll index 4b03563fd9924..a43dba94d30c7 100644 --- a/llvm/test/CodeGen/X86/ctpop-mask.ll +++ b/llvm/test/CodeGen/X86/ctpop-mask.ll @@ -106,23 +106,24 @@ define i32 @ctpop_mask3(i32 %x) nounwind readnone { ; ; X86-NO-POPCOUNT-LABEL: ctpop_mask3: ; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: andl $5, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-POPCOUNT-NEXT: andl $5, %ecx +; X86-NO-POPCOUNT-NEXT: addl %ecx, %ecx +; X86-NO-POPCOUNT-NEXT: movl $59796, %eax # imm = 0xE994 +; X86-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-POPCOUNT-NEXT: shrl %cl, %eax +; X86-NO-POPCOUNT-NEXT: andl $3, %eax ; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_mask3: ; X64-NO-POPCOUNT: # %bb.0: +; X64-NO-POPCOUNT-NEXT: # kill: def $edi killed 
$edi def $rdi ; X64-NO-POPCOUNT-NEXT: andl $5, %edi -; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201 -; X64-NO-POPCOUNT-NEXT: shrl $3, %eax -; X64-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X64-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X64-NO-POPCOUNT-NEXT: shrl $28, %eax +; X64-NO-POPCOUNT-NEXT: leal (%rdi,%rdi), %ecx +; X64-NO-POPCOUNT-NEXT: movl $59796, %eax # imm = 0xE994 +; X64-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-POPCOUNT-NEXT: shrl %cl, %eax +; X64-NO-POPCOUNT-NEXT: andl $3, %eax ; X64-NO-POPCOUNT-NEXT: retq %mask = and i32 %x, 5 ; 0b101 %count = tail call i32 @llvm.ctpop.i32(i32 %mask) @@ -147,24 +148,23 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone { ; ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask3: ; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NO-POPCOUNT-NEXT: andl $14, %eax -; X86-NO-POPCOUNT-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 -; X86-NO-POPCOUNT-NEXT: shrl $3, %eax -; X86-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X86-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X86-NO-POPCOUNT-NEXT: shrl $28, %eax +; X86-NO-POPCOUNT-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NO-POPCOUNT-NEXT: andl $14, %ecx +; X86-NO-POPCOUNT-NEXT: movl $59796, %eax # imm = 0xE994 +; X86-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-POPCOUNT-NEXT: shrl %cl, %eax +; X86-NO-POPCOUNT-NEXT: andl $3, %eax ; X86-NO-POPCOUNT-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NO-POPCOUNT-NEXT: retl ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask3: ; X64-NO-POPCOUNT: # %bb.0: -; X64-NO-POPCOUNT-NEXT: andl $14, %edi -; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201 -; X64-NO-POPCOUNT-NEXT: shrl $3, %eax -; X64-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X64-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; 
X64-NO-POPCOUNT-NEXT: shrl $28, %eax +; X64-NO-POPCOUNT-NEXT: movl %edi, %ecx +; X64-NO-POPCOUNT-NEXT: andl $14, %ecx +; X64-NO-POPCOUNT-NEXT: movl $59796, %eax # imm = 0xE994 +; X64-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-POPCOUNT-NEXT: shrl %cl, %eax +; X64-NO-POPCOUNT-NEXT: andl $3, %eax ; X64-NO-POPCOUNT-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NO-POPCOUNT-NEXT: retq %mask = and i16 %x, 14 ; 7 << 1 @@ -202,11 +202,11 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone { ; X64-NO-POPCOUNT-LABEL: ctpop_mask4: ; X64-NO-POPCOUNT: # %bb.0: ; X64-NO-POPCOUNT-NEXT: andl $15, %edi -; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201 -; X64-NO-POPCOUNT-NEXT: shrl $3, %eax -; X64-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X64-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X64-NO-POPCOUNT-NEXT: shrl $28, %eax +; X64-NO-POPCOUNT-NEXT: leal (,%rdi,4), %ecx +; X64-NO-POPCOUNT-NEXT: movabsq $4841987667533046032, %rax # imm = 0x4332322132212110 +; X64-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-POPCOUNT-NEXT: shrq %cl, %rax +; X64-NO-POPCOUNT-NEXT: andl $7, %eax ; X64-NO-POPCOUNT-NEXT: retq %mask = and i64 %x, 15 %count = tail call i64 @llvm.ctpop.i64(i64 %mask) @@ -241,13 +241,14 @@ define i32 @ctpop_shifted_mask4(i32 %x) nounwind readnone { ; ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask4: ; X64-NO-POPCOUNT: # %bb.0: -; X64-NO-POPCOUNT-NEXT: shrl $9, %edi -; X64-NO-POPCOUNT-NEXT: andl $15, %edi -; X64-NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201 -; X64-NO-POPCOUNT-NEXT: shrl $3, %eax -; X64-NO-POPCOUNT-NEXT: andl $17895697, %eax # imm = 0x1111111 -; X64-NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 -; X64-NO-POPCOUNT-NEXT: shrl $28, %eax +; X64-NO-POPCOUNT-NEXT: movl %edi, %ecx +; X64-NO-POPCOUNT-NEXT: shrl $7, %ecx +; X64-NO-POPCOUNT-NEXT: andl $60, %ecx +; X64-NO-POPCOUNT-NEXT: movabsq $4841987667533046032, %rax # imm 
= 0x4332322132212110 +; X64-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-POPCOUNT-NEXT: shrq %cl, %rax +; X64-NO-POPCOUNT-NEXT: andl $7, %eax +; X64-NO-POPCOUNT-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NO-POPCOUNT-NEXT: retq %mask = and i32 %x, 7680 ; 15 << 9 %count = tail call i32 @llvm.ctpop.i32(i32 %mask) From 88a18556aeeaf70315990ed9fb23c28834edf454 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 13:39:44 +0000 Subject: [PATCH 096/351] Fix MSVC signed/unsigned mismatch warning --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 812bb26f201a0..7d6448dc35f9a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13451,7 +13451,7 @@ struct NodeExtensionHelper { ? MVT::getFloatingPointVT(NarrowSize) : MVT::getIntegerVT(NarrowSize); - assert(NarrowSize >= (SupportsExt == ExtKind::FPExt ? 16 : 8) && + assert((int)NarrowSize >= (SupportsExt == ExtKind::FPExt ? 
16 : 8) && "Trying to extend something we can't represent"); MVT NarrowVT = MVT::getVectorVT(EltVT, VT.getVectorElementCount()); return NarrowVT; From e3d4cac2bcbdb719a0d29055c8c60df0e98e0126 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 13:40:43 +0000 Subject: [PATCH 097/351] Fix MSVC "not all control paths return a value" warning --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7d6448dc35f9a..f7275eb7c77bb 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13397,6 +13397,7 @@ struct NodeExtensionHelper { case ExtKind::FPExt: return RISCVISD::FP_EXTEND_VL; } + llvm_unreachable("Unknown ExtKind enum"); } /// Get or create a value that can feed \p Root with the given extension \p From 1a7166833d38a2a5c26eacecd13833a9a15e3b3d Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Wed, 21 Feb 2024 13:58:04 +0000 Subject: [PATCH 098/351] [AArch64] Fix stack probing clobbering flags (#81879) Certain stack probing sequences might clobber flags, then we can't use a block as a prologue if the flags register is a live-in on entry to that block. --- .../Target/AArch64/AArch64FrameLowering.cpp | 6 + .../AArch64/stack-probing-shrink-wrap.mir | 107 ++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index d98750e09d4e3..3485edb69c910 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1061,6 +1061,12 @@ bool AArch64FrameLowering::canUseAsPrologue( return false; } + // Certain stack probing sequences might clobber flags, then we can't use + // the block as a prologue if the flags register is a live-in. 
+ if (MF->getInfo()->hasStackProbing() && + MBB.isLiveIn(AArch64::NZCV)) + return false; + // Don't need a scratch register if we're not going to re-align the stack or // emit stack probes. if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF)) diff --git a/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir b/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir new file mode 100644 index 0000000000000..83aa90d389a4a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir @@ -0,0 +1,107 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc %s --start-before=shrink-wrap --stop-after=prologepilog -o - | FileCheck %s +--- | + target triple = "aarch64-linux" + + define void @f(i32 %n) #0 { + entry: + %a = alloca i8, i32 150000, align 8 + %c0 = icmp sle i32 %n, 1 + br i1 %c0, label %if.then1, label %exit + + if.then1: ; preds = %entry + %0 = icmp sle i32 %n, 1 + %v = select i1 %0, i32 0, i32 1 + call void @g(ptr %a, i32 %v) + br label %exit + + exit: ; preds = %if.then1, %entry + ret void + } + + declare void @g(...) + + attributes #0 = { nounwind "probe-stack"="inline-asm" "stack-probe-size"="4096" } + +... 
+--- +name: f +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + localFrameSize: 150000 +stack: + - { id: 0, name: a, type: default, offset: 0, size: 150000, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: -150000, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: f + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $w0, $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1) + ; CHECK-NEXT: $x9 = frame-setup SUBXri $sp, 36, 12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.entry: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $w0, $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1, 12 + ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 + ; CHECK-NEXT: $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv + ; CHECK-NEXT: frame-setup Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.entry: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2544, 0 + ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 + ; CHECK-NEXT: dead $wzr = SUBSWri killed renamable $w0, 1, 0, implicit-def $nzcv + ; CHECK-NEXT: Bcc 12, %bb.2, implicit $nzcv + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.then1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w1 = CSINCWr $wzr, $wzr, 13, implicit killed $nzcv + ; CHECK-NEXT: $x0 = ADDXri $sp, 0, 0 + ; CHECK-NEXT: BL @g, 
csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit-def $sp + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.exit: + ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 36, 12 + ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2544, 0 + ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1) + ; CHECK-NEXT: RET_ReallyLR + bb.0.entry: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $w0 + + dead $wzr = SUBSWri killed renamable $w0, 1, 0, implicit-def $nzcv + Bcc 12, %bb.2, implicit $nzcv + B %bb.1 + + bb.1.if.then1: + successors: %bb.2(0x80000000) + liveins: $nzcv + + renamable $w1 = CSINCWr $wzr, $wzr, 13, implicit killed $nzcv + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + $x0 = ADDXri %stack.0.a, 0, 0 + BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + + bb.2.exit: + RET_ReallyLR + +... From 61bc5f6c7383ec7d8a0e847abcd56ddc02ee77bf Mon Sep 17 00:00:00 2001 From: harishch4 Date: Wed, 21 Feb 2024 19:38:27 +0530 Subject: [PATCH 099/351] [Flang]: Fix to bind(C) procs inside BLOCK construct (#82483) Name mangling is invoked for a bind(C) procedure contained in a block in a context that does not have access to block ID mapping. Relaxing an assert to account for this. 
Fixes #79408 --- flang/lib/Lower/Mangler.cpp | 3 ++- flang/test/Lower/HLFIR/block_bindc_pocs.f90 | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 flang/test/Lower/HLFIR/block_bindc_pocs.f90 diff --git a/flang/lib/Lower/Mangler.cpp b/flang/lib/Lower/Mangler.cpp index 24abbce01c059..9a33be318a27d 100644 --- a/flang/lib/Lower/Mangler.cpp +++ b/flang/lib/Lower/Mangler.cpp @@ -182,7 +182,8 @@ Fortran::lower::mangle::mangleName(const Fortran::semantics::Symbol &symbol, bool underscoring) { assert((symbol.owner().kind() != Fortran::semantics::Scope::Kind::BlockConstruct || - symbol.has()) && + symbol.has() || + Fortran::semantics::IsBindCProcedure(symbol)) && "block object mangling must specify a scopeBlockIdMap"); ScopeBlockIdMap scopeBlockIdMap; return mangleName(symbol, scopeBlockIdMap, keepExternalInScope, underscoring); diff --git a/flang/test/Lower/HLFIR/block_bindc_pocs.f90 b/flang/test/Lower/HLFIR/block_bindc_pocs.f90 new file mode 100644 index 0000000000000..cfec45cfcd854 --- /dev/null +++ b/flang/test/Lower/HLFIR/block_bindc_pocs.f90 @@ -0,0 +1,20 @@ +! This test checks bind(c) procs inside BLOCK construct. 
+ +!RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +module m + interface + subroutine test_proc() bind(C) + end subroutine test_proc + end interface +end module m +!CHECK-DAG: %[[S0:.*]] = fir.call @llvm.stacksave.p0() fastmath : () -> !fir.ref +!CHECK-DAG: fir.call @test_proc() fastmath : () -> () +!CHECK-DAG: fir.call @llvm.stackrestore.p0(%[[S0]]) fastmath : (!fir.ref) -> () +!CHECK-DAG: func.func private @test_proc() attributes {fir.bindc_name = "test_proc"} +subroutine test + BLOCK + use m + call test_proc + END BLOCK +end subroutine test From 91ebd010aa76a711abd88f74ecca8e82e15b23cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 21 Feb 2024 08:12:31 +0100 Subject: [PATCH 100/351] [clang][Interp] Remove dereference() This function tried to be smart about the dereferenced value, but it ended up hurting more than it helped. At least in the current state, where we still try get the correct output. I might add something similar back later. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 159 ++--------------------- clang/lib/AST/Interp/ByteCodeExprGen.h | 23 ---- clang/lib/AST/Interp/Interp.cpp | 12 +- 3 files changed, 18 insertions(+), 176 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 70e2bca2ebf16..d11d05dd709d5 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -81,16 +81,15 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { if (DiscardResult) return this->discard(SubExpr); - return dereference( - SubExpr, DerefKind::Read, - [](PrimType) { - // Value loaded - nothing to do here. - return true; - }, - [this, CE](PrimType T) { - // Pointer on stack - dereference it. 
- return this->emitLoadPop(T, CE); - }); + if (SubExpr->getType()->isAnyComplexType()) + return this->delegate(SubExpr); + + if (!this->visit(SubExpr)) + return false; + + if (std::optional SubExprT = classify(SubExpr->getType())) + return this->emitLoadPop(*SubExprT, CE); + return false; } case CK_UncheckedDerivedToBase: @@ -2326,134 +2325,6 @@ bool ByteCodeExprGen::visitZeroRecordInitializer(const Record *R, return true; } -template -bool ByteCodeExprGen::dereference( - const Expr *LV, DerefKind AK, llvm::function_ref Direct, - llvm::function_ref Indirect) { - if (std::optional T = classify(LV->getType())) { - if (!LV->refersToBitField()) { - // Only primitive, non bit-field types can be dereferenced directly. - if (const auto *DE = dyn_cast(LV)) { - if (!DE->getDecl()->getType()->isReferenceType()) { - if (const auto *PD = dyn_cast(DE->getDecl())) - return dereferenceParam(LV, *T, PD, AK, Direct, Indirect); - if (const auto *VD = dyn_cast(DE->getDecl())) - return dereferenceVar(LV, *T, VD, AK, Direct, Indirect); - } - } - } - - if (!visit(LV)) - return false; - return Indirect(*T); - } - - if (LV->getType()->isAnyComplexType()) - return this->delegate(LV); - - return false; -} - -template -bool ByteCodeExprGen::dereferenceParam( - const Expr *LV, PrimType T, const ParmVarDecl *PD, DerefKind AK, - llvm::function_ref Direct, - llvm::function_ref Indirect) { - if (auto It = this->Params.find(PD); It != this->Params.end()) { - unsigned Idx = It->second.Offset; - switch (AK) { - case DerefKind::Read: - return DiscardResult ? true : this->emitGetParam(T, Idx, LV); - - case DerefKind::Write: - if (!Direct(T)) - return false; - if (!this->emitSetParam(T, Idx, LV)) - return false; - return DiscardResult ? true : this->emitGetPtrParam(Idx, LV); - - case DerefKind::ReadWrite: - if (!this->emitGetParam(T, Idx, LV)) - return false; - if (!Direct(T)) - return false; - if (!this->emitSetParam(T, Idx, LV)) - return false; - return DiscardResult ? 
true : this->emitGetPtrParam(Idx, LV); - } - return true; - } - - // If the param is a pointer, we can dereference a dummy value. - if (!DiscardResult && T == PT_Ptr && AK == DerefKind::Read) { - if (auto Idx = P.getOrCreateDummy(PD)) - return this->emitGetPtrGlobal(*Idx, PD); - return false; - } - - // Value cannot be produced - try to emit pointer and do stuff with it. - return visit(LV) && Indirect(T); -} - -template -bool ByteCodeExprGen::dereferenceVar( - const Expr *LV, PrimType T, const VarDecl *VD, DerefKind AK, - llvm::function_ref Direct, - llvm::function_ref Indirect) { - auto It = Locals.find(VD); - if (It != Locals.end()) { - const auto &L = It->second; - switch (AK) { - case DerefKind::Read: - if (!this->emitGetLocal(T, L.Offset, LV)) - return false; - return DiscardResult ? this->emitPop(T, LV) : true; - - case DerefKind::Write: - if (!Direct(T)) - return false; - if (!this->emitSetLocal(T, L.Offset, LV)) - return false; - return DiscardResult ? true : this->emitGetPtrLocal(L.Offset, LV); - - case DerefKind::ReadWrite: - if (!this->emitGetLocal(T, L.Offset, LV)) - return false; - if (!Direct(T)) - return false; - if (!this->emitSetLocal(T, L.Offset, LV)) - return false; - return DiscardResult ? true : this->emitGetPtrLocal(L.Offset, LV); - } - } else if (auto Idx = P.getGlobal(VD)) { - switch (AK) { - case DerefKind::Read: - if (!this->emitGetGlobal(T, *Idx, LV)) - return false; - return DiscardResult ? this->emitPop(T, LV) : true; - - case DerefKind::Write: - if (!Direct(T)) - return false; - if (!this->emitSetGlobal(T, *Idx, LV)) - return false; - return DiscardResult ? true : this->emitGetPtrGlobal(*Idx, LV); - - case DerefKind::ReadWrite: - if (!this->emitGetGlobal(T, *Idx, LV)) - return false; - if (!Direct(T)) - return false; - if (!this->emitSetGlobal(T, *Idx, LV)) - return false; - return DiscardResult ? true : this->emitGetPtrGlobal(*Idx, LV); - } - } - - // Value cannot be produced - try to emit pointer. 
- return visit(LV) && Indirect(T); -} - template template bool ByteCodeExprGen::emitConst(T Value, PrimType Ty, const Expr *E) { @@ -3092,15 +2963,9 @@ bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { // We should already have a pointer when we get here. return this->delegate(SubExpr); case UO_Deref: // *x - return dereference( - SubExpr, DerefKind::Read, - [](PrimType) { - llvm_unreachable("Dereferencing requires a pointer"); - return false; - }, - [this, E](PrimType T) { - return DiscardResult ? this->emitPop(T, E) : true; - }); + if (DiscardResult) + return this->discard(SubExpr); + return this->visit(SubExpr); case UO_Not: // ~x if (!this->visit(SubExpr)) return false; diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 4c88e33b3ece4..8f7a0c2fc3c10 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -236,29 +236,6 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, bool visitZeroInitializer(PrimType T, QualType QT, const Expr *E); bool visitZeroRecordInitializer(const Record *R, const Expr *E); - enum class DerefKind { - /// Value is read and pushed to stack. - Read, - /// Direct method generates a value which is written. Returns pointer. - Write, - /// Direct method receives the value, pushes mutated value. Returns pointer. - ReadWrite, - }; - - /// Method to directly load a value. If the value can be fetched directly, - /// the direct handler is called. Otherwise, a pointer is left on the stack - /// and the indirect handler is expected to operate on that. 
- bool dereference(const Expr *LV, DerefKind AK, - llvm::function_ref Direct, - llvm::function_ref Indirect); - bool dereferenceParam(const Expr *LV, PrimType T, const ParmVarDecl *PD, - DerefKind AK, - llvm::function_ref Direct, - llvm::function_ref Indirect); - bool dereferenceVar(const Expr *LV, PrimType T, const VarDecl *PD, - DerefKind AK, llvm::function_ref Direct, - llvm::function_ref Indirect); - /// Emits an APSInt constant. bool emitConst(const llvm::APSInt &Value, PrimType Ty, const Expr *E); bool emitConst(const llvm::APSInt &Value, const Expr *E); diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 1a48b8bddced0..82bc1f240cc51 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -362,13 +362,13 @@ bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if (Ptr.isInitialized()) return true; + if (const auto *VD = Ptr.getDeclDesc()->asVarDecl(); + VD && VD->hasGlobalStorage()) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_var_init_non_constant, 1) << VD; + S.Note(VD->getLocation(), diag::note_declared_at); + } if (!S.checkingPotentialConstantExpression()) { - if (const auto *VD = Ptr.getDeclDesc()->asVarDecl(); - VD && VD->hasGlobalStorage()) { - const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_var_init_non_constant, 1) << VD; - S.Note(VD->getLocation(), diag::note_declared_at); - } S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_access_uninit) << AK << /*uninitialized=*/true << S.Current->getRange(OpPC); } From f7c2e5fa05d221a3dfc53744f353517407c2ffec Mon Sep 17 00:00:00 2001 From: Rajveer Singh Bharadwaj Date: Wed, 21 Feb 2024 20:00:49 +0530 Subject: [PATCH 101/351] [clang] [SemaCXX] Disallow deducing "this" on operator `new` and `delete` (#82251) Resolves Issue #82249 As described in the issue, any deallocation function for a `class X` is a static member (even if not 
explicitly declared static). --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaDeclCXX.cpp | 4 +++- clang/test/SemaCXX/cxx2b-deducing-this.cpp | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5bca2c965c866..c17298bc7bce5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -279,6 +279,8 @@ Bug Fixes to C++ Support Fixes (`#68490 `_) - Fix a crash when trying to call a varargs function that also has an explicit object parameter. Fixes (`#80971 ICE when explicit object parameter be a function parameter pack`) +- Reject explicit object parameters on `new` and `delete` operators. + Fixes (`#82249 ` _) - Fixed a bug where abbreviated function templates would append their invented template parameters to an empty template parameter lists. - Clang now classifies aggregate initialization in C++17 and newer as constant diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 79263bc3ff671..7c009d9c8ec09 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -11395,7 +11395,9 @@ void Sema::CheckExplicitObjectMemberFunction(Declarator &D, << ExplicitObjectParam->getSourceRange(); } - if (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_static) { + if (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_static || + (D.getContext() == clang::DeclaratorContext::Member && + D.isStaticMember())) { Diag(ExplicitObjectParam->getBeginLoc(), diag::err_explicit_object_parameter_nonmember) << D.getSourceRange() << /*static=*/0 << IsLambda; diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 30131d6adc4db..b8ddb9ad30003 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -16,6 +16,10 @@ struct S { static void f(this auto); // expected-error{{an explicit object parameter cannot appear in a static 
function}} virtual void f(this S); // expected-error{{an explicit object parameter cannot appear in a virtual function}} + // new and delete are implicitly static + void *operator new(this unsigned long); // expected-error{{an explicit object parameter cannot appear in a static function}} + void operator delete(this void*); // expected-error{{an explicit object parameter cannot appear in a static function}} + void g(this auto) const; // expected-error{{explicit object member function cannot have 'const' qualifier}} void h(this auto) &; // expected-error{{explicit object member function cannot have '&' qualifier}} void i(this auto) &&; // expected-error{{explicit object member function cannot have '&&' qualifier}} From 6ba8ca8c1600ce33274e4f22397bd4d400f0ad8d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 14:25:17 +0000 Subject: [PATCH 102/351] [CostModel][X86] Don't use undef for icmp cost tests Cleanup prior to #80122 fix - using undef means we think that the comparison is with a Constant --- .../Analysis/CostModel/X86/icmp-codesize.ll | 3540 +++++------ .../Analysis/CostModel/X86/icmp-latency.ll | 3940 ++++++------- .../CostModel/X86/icmp-sizelatency.ll | 3540 +++++------ llvm/test/Analysis/CostModel/X86/icmp.ll | 5220 ++++++++--------- 4 files changed, 8120 insertions(+), 8120 deletions(-) diff --git a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll index 511ef81629c1d..d952e9896d215 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll @@ -15,2076 +15,2076 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 -define i32 @cmp_int_eq(i32 %arg) { +define i32 @cmp_int_eq(i8 %arg8, <16 x i8> 
%argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_eq' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_eq' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x 
i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_eq' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX1-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; 
AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 
%arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_eq' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> 
%argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_eq' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef 
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = 
icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_eq' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_eq' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 
10 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I16 = icmp eq i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_eq' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp eq i8 undef, undef - %V16I8 = icmp eq <16 x i8> undef, undef - %V32I8 = icmp eq <32 x i8> undef, undef - %V64I8 = icmp eq <64 x i8> undef, undef - %V128I8 = icmp eq <128 x i8> undef, undef + %I8 = icmp eq i8 %arg8, %arg8 + %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp eq i16 undef, undef - %V8I16 = icmp eq <8 x i16> undef, undef - %V16I16 = icmp eq <16 x i16> undef, undef - %V32I16 = icmp eq <32 x i16> undef, undef - %V64I16 = icmp eq <64 x i16> undef, undef + %I16 = icmp eq i16 %arg16, %arg16 + %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp eq i32 undef, undef - %V4I32 = icmp eq <4 x i32> undef, undef - %V8I32 = icmp eq <8 x i32> undef, undef - %V16I32 = icmp eq <16 x i32> undef, undef - %V32I32 = icmp eq <32 x i32> undef, undef + %I32 = icmp eq i32 %arg32, %arg32 + %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp 
eq <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp eq i64 undef, undef - %V2I64 = icmp eq <2 x i64> undef, undef - %V4I64 = icmp eq <4 x i64> undef, undef - %V8I64 = icmp eq <8 x i64> undef, undef - %V16I64 = icmp eq <16 x i64> undef, undef + %I64 = icmp eq i64 %arg64, %arg64 + %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ne(i32 %arg) { +define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ne' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x 
i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ne' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = 
icmp ne <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ne' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ne <16 x i32> 
undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ne' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ne' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ne' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, 
%argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ne' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost 
of 10 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 
%arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ne' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ne i8 undef, undef - %V16I8 = icmp ne <16 x i8> undef, undef - %V32I8 = icmp ne <32 x i8> undef, undef - %V64I8 = icmp ne <64 x i8> undef, undef - %V128I8 = icmp ne <128 x i8> undef, undef + %I8 = icmp ne i8 %arg8, %arg8 + %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ne i16 undef, undef - %V8I16 = icmp ne <8 x i16> undef, undef - %V16I16 = icmp ne <16 x i16> undef, undef - %V32I16 = icmp ne <32 x i16> undef, undef - %V64I16 = icmp ne <64 x i16> undef, undef + %I16 = icmp ne i16 %arg16, %arg16 + %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 + 
%V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ne i32 undef, undef - %V4I32 = icmp ne <4 x i32> undef, undef - %V8I32 = icmp ne <8 x i32> undef, undef - %V16I32 = icmp ne <16 x i32> undef, undef - %V32I32 = icmp ne <32 x i32> undef, undef + %I32 = icmp ne i32 %arg32, %arg32 + %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ne i64 undef, undef - %V2I64 = icmp ne <2 x i64> undef, undef - %V4I64 = icmp ne <4 x i64> undef, undef - %V8I64 = icmp ne <8 x i64> undef, undef - %V16I64 = icmp ne <16 x i64> undef, undef + %I64 = icmp ne i64 %arg64, %arg64 + %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sge(i32 %arg) { +define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sge' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef 
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge 
<16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sge' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x 
i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> 
%argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sge' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sge <16 x i16> undef, 
undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sge' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX2-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> undef, 
undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp 
sge i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sge' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, 
undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 
x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sge' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sge' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: 
%V16I32 = icmp sge <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sge' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef 
-; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 
8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp sge i8 undef, undef - %V16I8 = icmp sge <16 x i8> undef, undef - %V32I8 = icmp sge <32 x i8> undef, undef - %V64I8 = icmp sge <64 x i8> undef, undef - %V128I8 = icmp sge <128 x i8> undef, undef + %I8 = icmp sge i8 %arg8, %arg8 + %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sge i16 undef, undef - %V8I16 = icmp sge <8 x i16> undef, undef - %V16I16 = icmp sge <16 x i16> undef, undef - %V32I16 = icmp sge <32 x i16> undef, undef - %V64I16 = icmp sge <64 x i16> undef, undef + %I16 = icmp sge i16 %arg16, %arg16 + %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sge i32 undef, undef - %V4I32 = icmp sge <4 x i32> undef, undef - %V8I32 = icmp sge <8 x i32> undef, undef - %V16I32 = icmp sge <16 x i32> undef, undef - %V32I32 = icmp sge <32 x i32> undef, undef + %I32 = icmp sge i32 %arg32, %arg32 + %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sge i64 undef, undef - %V2I64 = icmp sge <2 x i64> undef, undef - %V4I64 = icmp sge <4 x i64> undef, undef - %V8I64 = icmp sge <8 x i64> undef, undef - %V16I64 = icmp sge <16 x i64> undef, undef + %I64 = icmp sge i64 %arg64, %arg64 + %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ret i32 
undef } -define i32 @cmp_int_uge(i32 %arg) { +define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE42-LABEL: 'cmp_int_uge' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_uge' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp uge <64 x i8> 
%argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_uge' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_uge' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_uge' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = 
icmp uge <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_uge' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, 
undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp 
uge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_uge' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> 
%argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp uge i8 undef, undef - %V16I8 = icmp uge <16 x i8> undef, undef - %V32I8 = icmp uge <32 x i8> undef, undef - %V64I8 = icmp uge <64 x i8> undef, undef - %V128I8 = icmp uge <128 x i8> undef, undef + %I8 = icmp uge i8 %arg8, %arg8 + %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp uge i16 undef, undef - %V8I16 = icmp uge <8 x i16> undef, undef - %V16I16 = icmp uge <16 x i16> undef, undef - %V32I16 = icmp uge <32 x i16> undef, undef - %V64I16 = icmp uge <64 x i16> undef, undef + %I16 = icmp uge i16 %arg16, %arg16 + %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp uge i32 undef, undef - %V4I32 = icmp uge <4 x i32> undef, undef - %V8I32 = icmp uge <8 x i32> undef, undef - %V16I32 = icmp uge <16 x i32> undef, undef - %V32I32 = icmp uge <32 x i32> undef, undef + %I32 = icmp uge i32 %arg32, %arg32 + %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp uge i64 undef, undef - %V2I64 = icmp uge <2 x i64> undef, undef - %V4I64 = icmp uge <4 x i64> undef, undef - %V8I64 = icmp uge <8 x i64> undef, undef - %V16I64 = icmp uge <16 x i64> undef, undef + %I64 = icmp uge i64 %arg64, %arg64 + %V2I64 = icmp 
uge <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sgt(i32 %arg) { +define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sgt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 
= icmp sgt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, 
%argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sgt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> 
undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sgt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I64 = icmp sgt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = 
icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sgt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> 
undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sgt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 
%arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sgt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt 
<4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sgt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sgt <64 x i8> undef, 
undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sgt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp sgt i8 undef, undef - %V16I8 = icmp sgt <16 x i8> undef, undef - %V32I8 = icmp sgt <32 x i8> undef, undef - %V64I8 = icmp sgt <64 x i8> undef, undef - %V128I8 = icmp sgt <128 x i8> undef, undef + %I8 = icmp sgt i8 %arg8, %arg8 + %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sgt i16 undef, undef - %V8I16 = icmp sgt <8 x i16> undef, undef - %V16I16 = icmp sgt <16 x i16> undef, undef - %V32I16 = icmp sgt <32 x i16> undef, undef - %V64I16 = icmp sgt <64 x i16> undef, undef + %I16 = icmp sgt i16 %arg16, %arg16 + %V8I16 
= icmp sgt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sgt i32 undef, undef - %V4I32 = icmp sgt <4 x i32> undef, undef - %V8I32 = icmp sgt <8 x i32> undef, undef - %V16I32 = icmp sgt <16 x i32> undef, undef - %V32I32 = icmp sgt <32 x i32> undef, undef + %I32 = icmp sgt i32 %arg32, %arg32 + %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sgt i64 undef, undef - %V2I64 = icmp sgt <2 x i64> undef, undef - %V4I64 = icmp sgt <4 x i64> undef, undef - %V8I64 = icmp sgt <8 x i64> undef, undef - %V16I64 = icmp sgt <16 x i64> undef, undef + %I64 = icmp sgt i64 %arg64, %arg64 + %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ugt(i32 %arg) { +define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ugt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated 
cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE2-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 
x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ugt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE42-NEXT: Cost 
Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ugt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: 
Found an estimated cost of 7 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp 
ugt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ugt' -; AVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX2-NEXT: Cost 
Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, 
%argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ugt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, 
%argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ugt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt 
<2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ugt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp 
ugt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 
for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ugt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ugt i8 undef, undef - %V16I8 = icmp ugt <16 x i8> undef, undef - %V32I8 = icmp ugt <32 x i8> undef, undef - %V64I8 = icmp ugt <64 x i8> undef, undef - %V128I8 = icmp ugt <128 x i8> undef, undef + %I8 = icmp ugt i8 %arg8, %arg8 + %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ugt i16 undef, undef - %V8I16 = icmp ugt <8 x i16> undef, undef - %V16I16 = icmp ugt <16 x i16> undef, undef - %V32I16 = icmp ugt <32 x i16> undef, undef - %V64I16 = icmp ugt <64 x i16> undef, undef + %I16 = icmp ugt i16 %arg16, %arg16 + %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ugt i32 undef, undef - %V4I32 = icmp ugt <4 x i32> undef, undef - %V8I32 = icmp ugt <8 x i32> undef, undef - %V16I32 = icmp ugt <16 x i32> undef, undef - %V32I32 = icmp ugt <32 x i32> undef, undef + %I32 = icmp ugt i32 %arg32, %arg32 + %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ugt i64 undef, undef - %V2I64 = icmp ugt <2 x i64> undef, undef - %V4I64 = icmp ugt <4 x i64> undef, undef - %V8I64 = icmp ugt <8 x i64> undef, undef - %V16I64 = icmp ugt <16 x i64> undef, undef + %I64 = icmp ugt i64 %arg64, %arg64 + %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ugt <4 x i64> %argv4i64, 
%argv4i64 + %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sle(i32 %arg) { +define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sle' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 
for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sle' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sle' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX1-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sle' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost 
of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, 
%argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sle' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V32I8 = icmp sle <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> undef, 
undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sle' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, 
%argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sle' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated 
cost of 20 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp 
sle <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, 
%argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sle' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp 
sle i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp sle i8 undef, undef - %V16I8 = icmp sle <16 x i8> undef, undef - %V32I8 = icmp sle <32 x i8> undef, undef - %V64I8 = icmp sle <64 x i8> undef, undef - %V128I8 = icmp sle <128 x i8> undef, undef + %I8 = icmp sle i8 %arg8, %arg8 + %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sle i16 undef, undef - %V8I16 = icmp sle <8 x i16> undef, undef - %V16I16 = icmp sle <16 x i16> undef, undef - %V32I16 = icmp sle <32 x i16> undef, undef - %V64I16 = icmp sle <64 x i16> undef, undef + %I16 = icmp sle i16 %arg16, %arg16 + %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sle 
<16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sle i32 undef, undef - %V4I32 = icmp sle <4 x i32> undef, undef - %V8I32 = icmp sle <8 x i32> undef, undef - %V16I32 = icmp sle <16 x i32> undef, undef - %V32I32 = icmp sle <32 x i32> undef, undef + %I32 = icmp sle i32 %arg32, %arg32 + %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sle i64 undef, undef - %V2I64 = icmp sle <2 x i64> undef, undef - %V4I64 = icmp sle <4 x i64> undef, undef - %V8I64 = icmp sle <8 x i64> undef, undef - %V16I64 = icmp sle <16 x i64> undef, undef + %I64 = icmp sle i64 %arg64, %arg64 + %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ule(i32 %arg) { +define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE42-LABEL: 'cmp_int_ule' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> 
undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ule' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ule' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule 
<16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ule' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp 
ule <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ule' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I32 = icmp ule i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ule' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp 
ule <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ule' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ule i8 undef, undef - %V16I8 = icmp ule <16 x i8> undef, undef - %V32I8 = icmp ule <32 x i8> undef, undef - %V64I8 = icmp ule <64 x i8> undef, undef - %V128I8 = icmp ule <128 x i8> undef, undef + %I8 = icmp ule i8 %arg8, %arg8 + %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ule i16 undef, undef - %V8I16 = icmp ule <8 x i16> undef, undef - %V16I16 = icmp ule <16 x i16> undef, undef - 
%V32I16 = icmp ule <32 x i16> undef, undef - %V64I16 = icmp ule <64 x i16> undef, undef + %I16 = icmp ule i16 %arg16, %arg16 + %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ule i32 undef, undef - %V4I32 = icmp ule <4 x i32> undef, undef - %V8I32 = icmp ule <8 x i32> undef, undef - %V16I32 = icmp ule <16 x i32> undef, undef - %V32I32 = icmp ule <32 x i32> undef, undef + %I32 = icmp ule i32 %arg32, %arg32 + %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ule i64 undef, undef - %V2I64 = icmp ule <2 x i64> undef, undef - %V4I64 = icmp ule <4 x i64> undef, undef - %V8I64 = icmp ule <8 x i64> undef, undef - %V16I64 = icmp ule <16 x i64> undef, undef + %I64 = icmp ule i64 %arg64, %arg64 + %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_slt(i32 %arg) { +define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_slt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> 
undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp 
slt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_slt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE42-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_slt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX1-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp slt <64 x 
i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_slt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_slt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_slt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = 
icmp slt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_slt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, 
undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp 
slt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_slt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> 
%argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp slt i8 undef, undef - %V16I8 = icmp slt <16 x i8> undef, undef - %V32I8 = icmp slt <32 x i8> undef, undef - %V64I8 = icmp slt <64 x i8> undef, undef - %V128I8 = icmp slt <128 x i8> undef, undef + %I8 = icmp slt i8 %arg8, %arg8 + %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp slt i16 undef, undef - %V8I16 = icmp slt <8 x i16> undef, undef - %V16I16 = icmp slt <16 x i16> undef, undef - %V32I16 = icmp slt <32 x i16> undef, undef - %V64I16 = icmp slt <64 x i16> undef, undef + %I16 = icmp slt i16 %arg16, %arg16 + %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp slt i32 undef, undef - %V4I32 = icmp slt <4 x i32> undef, undef - %V8I32 = icmp slt <8 x i32> undef, undef - %V16I32 = icmp slt <16 x i32> undef, undef - %V32I32 = icmp slt <32 x i32> undef, undef + %I32 = icmp slt i32 %arg32, %arg32 + %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp slt i64 undef, undef - %V2I64 = icmp slt <2 x i64> undef, undef - %V4I64 = icmp slt <4 x i64> undef, undef - %V8I64 = icmp slt <8 x i64> undef, undef - %V16I64 = icmp slt <16 x i64> undef, undef + %I64 = icmp slt i64 %arg64, %arg64 + %V2I64 = icmp 
slt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ult(i32 %arg) { +define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ult' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V4I32 = icmp ult <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> 
%argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ult' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp 
ult <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 
x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ult' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated 
cost of 7 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ult' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ult' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = 
icmp ult <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ult' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ult' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated 
cost of 10 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I8 = icmp ult i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, 
%argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ult' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> undef, 
undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 
= icmp ult <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ult i8 undef, undef - %V16I8 = icmp ult <16 x i8> undef, undef - %V32I8 = icmp ult <32 x i8> undef, undef - %V64I8 = icmp ult <64 x i8> undef, undef - %V128I8 = icmp ult <128 x i8> undef, undef + %I8 = icmp ult i8 %arg8, %arg8 + %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ult i16 undef, undef - %V8I16 = icmp ult <8 x i16> undef, undef - %V16I16 = icmp ult <16 x i16> undef, undef - %V32I16 = icmp ult <32 x i16> undef, undef - %V64I16 = icmp ult 
<64 x i16> undef, undef + %I16 = icmp ult i16 %arg16, %arg16 + %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ult i32 undef, undef - %V4I32 = icmp ult <4 x i32> undef, undef - %V8I32 = icmp ult <8 x i32> undef, undef - %V16I32 = icmp ult <16 x i32> undef, undef - %V32I32 = icmp ult <32 x i32> undef, undef + %I32 = icmp ult i32 %arg32, %arg32 + %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ult i64 undef, undef - %V2I64 = icmp ult <2 x i64> undef, undef - %V4I64 = icmp ult <4 x i64> undef, undef - %V8I64 = icmp ult <8 x i64> undef, undef - %V16I64 = icmp ult <16 x i64> undef, undef + %I64 = icmp ult i64 %arg64, %arg64 + %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ret i32 undef } diff --git a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll index 4969a9649d5c9..89cc6e893b318 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll @@ -15,2306 +15,2306 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 -define i32 @cmp_int_eq(i32 %arg) { +define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x 
i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_eq' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I64 = icmp eq i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 
x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_eq' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; 
SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_eq' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_eq' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> 
undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_eq' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for 
instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; 
AVX512BW-LABEL: 'cmp_int_eq' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX512BW-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = 
icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_eq' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V64I16 = icmp eq <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> 
%argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_eq' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq 
<16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_eq' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef 
-; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp eq i8 undef, undef - %V16I8 = icmp eq <16 x i8> undef, undef - %V32I8 = icmp eq <32 x i8> undef, undef - %V64I8 = icmp eq <64 x i8> undef, undef - %V128I8 = icmp eq <128 x i8> undef, undef + %I8 = icmp eq i8 %arg8, %arg8 + %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp eq i16 undef, undef - %V8I16 = icmp eq <8 x i16> undef, undef - %V16I16 = icmp eq <16 x i16> undef, undef - %V32I16 = icmp eq <32 x i16> undef, undef - %V64I16 = icmp eq <64 x i16> undef, undef + %I16 = icmp eq i16 %arg16, %arg16 + %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp eq <16 x i16> %argv16i16, 
%argv16i16 + %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp eq i32 undef, undef - %V4I32 = icmp eq <4 x i32> undef, undef - %V8I32 = icmp eq <8 x i32> undef, undef - %V16I32 = icmp eq <16 x i32> undef, undef - %V32I32 = icmp eq <32 x i32> undef, undef + %I32 = icmp eq i32 %arg32, %arg32 + %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp eq i64 undef, undef - %V2I64 = icmp eq <2 x i64> undef, undef - %V4I64 = icmp eq <4 x i64> undef, undef - %V8I64 = icmp eq <8 x i64> undef, undef - %V16I64 = icmp eq <16 x i64> undef, undef + %I64 = icmp eq i64 %arg64, %arg64 + %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ne(i32 %arg) { +define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ne' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ne' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 
= icmp ne i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 
x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ne' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; 
AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = 
icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ne' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V32I8 = icmp ne <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ne' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne 
<16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ne' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne 
<16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ne' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; XOPAVX1-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, 
%argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ne' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> 
undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; XOPAVX2-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ne' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = 
icmp ne <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp 
ne <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> 
%argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ne i8 undef, undef - %V16I8 = icmp ne <16 x i8> undef, undef - %V32I8 = icmp ne <32 x i8> undef, undef - %V64I8 = icmp ne <64 x i8> undef, undef - %V128I8 = icmp ne <128 x i8> undef, undef + %I8 = icmp ne i8 %arg8, %arg8 + %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ne i16 undef, undef - %V8I16 = icmp ne <8 x i16> undef, undef - %V16I16 = icmp ne <16 x i16> undef, undef - %V32I16 = icmp ne <32 x i16> undef, undef - %V64I16 = icmp ne <64 x i16> undef, undef + %I16 = icmp ne i16 %arg16, %arg16 + %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ne i32 undef, undef - %V4I32 = icmp ne <4 x i32> undef, undef - %V8I32 = icmp ne <8 x i32> undef, undef - %V16I32 = icmp ne <16 x i32> undef, undef - %V32I32 = icmp ne <32 x i32> undef, undef + %I32 = icmp ne i32 %arg32, %arg32 + %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ne i64 undef, undef - %V2I64 = icmp ne <2 x i64> undef, undef - %V4I64 = icmp ne <4 x i64> undef, undef - %V8I64 = icmp ne <8 x i64> undef, undef - %V16I64 = icmp ne <16 
x i64> undef, undef + %I64 = icmp ne i64 %arg64, %arg64 + %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sge(i32 %arg) { +define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sge' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; 
SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sge' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE42-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x 
i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sge' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sge <32 x 
i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, 
%argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sge' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX2-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I16 = icmp sge i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sge' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = 
icmp sge <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sge' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef 
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x 
i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sge' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; XOPAVX1-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> 
%argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sge' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> 
undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_sge' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %I16 = icmp sge i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; 
SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp sge i8 undef, undef - %V16I8 = icmp sge <16 x i8> undef, undef - %V32I8 = icmp sge <32 x i8> undef, undef - %V64I8 = icmp sge <64 x i8> undef, undef - %V128I8 = icmp sge <128 x i8> undef, undef + %I8 = icmp sge i8 %arg8, %arg8 + %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sge i16 undef, undef - %V8I16 = icmp sge <8 x i16> undef, undef - %V16I16 = icmp sge <16 x i16> undef, undef - %V32I16 = icmp sge <32 x i16> undef, undef - %V64I16 = icmp sge <64 x i16> undef, undef + %I16 = icmp sge i16 %arg16, %arg16 + %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sge i32 undef, undef - %V4I32 = icmp sge <4 x i32> undef, undef - %V8I32 = icmp sge <8 x i32> undef, undef - %V16I32 = icmp sge <16 x i32> undef, undef - %V32I32 = icmp sge <32 x i32> undef, undef + %I32 = icmp sge i32 %arg32, %arg32 + %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sge i64 undef, undef - %V2I64 = icmp sge <2 x i64> undef, undef - %V4I64 = icmp sge <4 x i64> undef, undef - %V8I64 = icmp sge <8 x i64> undef, undef - %V16I64 = icmp sge <16 x i64> undef, undef + %I64 = icmp sge i64 %arg64, %arg64 + %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_uge(i32 %arg) { +define i32 @cmp_int_uge(i8 %arg8, 
<16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE42-LABEL: 'cmp_int_uge' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef 
-; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_uge' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; 
AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_uge' -; AVX2-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef 
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> 
%argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_uge' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, 
%argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_uge' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge 
<2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_uge' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge 
<8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_uge' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; XOPAVX2-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp 
uge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_uge' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = 
icmp uge <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp uge i8 undef, undef - %V16I8 = icmp uge <16 x i8> undef, undef - %V32I8 = icmp uge <32 x i8> undef, undef - %V64I8 = icmp uge <64 x i8> undef, undef - %V128I8 = icmp uge <128 x i8> undef, undef + %I8 = icmp uge i8 %arg8, %arg8 + %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp uge i16 undef, undef - %V8I16 = icmp uge <8 x i16> undef, undef - %V16I16 = icmp uge <16 x i16> undef, undef - %V32I16 = icmp uge <32 x i16> undef, undef - %V64I16 = icmp uge <64 x i16> undef, undef + %I16 = icmp uge i16 %arg16, %arg16 + %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp uge i32 undef, undef - %V4I32 = icmp uge <4 x i32> undef, undef - %V8I32 = icmp uge <8 x i32> undef, 
undef - %V16I32 = icmp uge <16 x i32> undef, undef - %V32I32 = icmp uge <32 x i32> undef, undef + %I32 = icmp uge i32 %arg32, %arg32 + %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp uge i64 undef, undef - %V2I64 = icmp uge <2 x i64> undef, undef - %V4I64 = icmp uge <4 x i64> undef, undef - %V8I64 = icmp uge <8 x i64> undef, undef - %V16I64 = icmp uge <16 x i64> undef, undef + %I64 = icmp uge i64 %arg64, %arg64 + %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sgt(i32 %arg) { +define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sgt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> 
%argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sgt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE42-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, 
%argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sgt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp 
sgt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sgt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V64I8 = icmp sgt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sgt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sgt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8I16 = icmp sgt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = 
icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x 
i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sgt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sgt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_sgt' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, 
undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> 
undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 
%arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp sgt i8 undef, undef - %V16I8 = icmp sgt <16 x i8> undef, undef - %V32I8 = icmp sgt <32 x i8> undef, undef - %V64I8 = icmp sgt <64 x i8> undef, undef - %V128I8 = icmp sgt <128 x i8> undef, undef + %I8 = icmp sgt i8 %arg8, %arg8 + %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sgt i16 undef, undef - %V8I16 = icmp sgt <8 x i16> undef, undef - %V16I16 = icmp sgt <16 x i16> undef, undef - %V32I16 = icmp sgt <32 x i16> undef, undef - %V64I16 = icmp sgt <64 x i16> undef, undef + %I16 = icmp sgt i16 %arg16, %arg16 + %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sgt i32 undef, undef - %V4I32 = icmp sgt <4 x i32> undef, undef - %V8I32 = icmp sgt <8 x i32> undef, undef - %V16I32 = icmp sgt <16 x i32> undef, undef - %V32I32 = icmp sgt <32 x i32> undef, undef + %I32 = icmp sgt i32 %arg32, %arg32 + %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sgt i64 undef, undef - 
%V2I64 = icmp sgt <2 x i64> undef, undef - %V4I64 = icmp sgt <4 x i64> undef, undef - %V8I64 = icmp sgt <8 x i64> undef, undef - %V16I64 = icmp sgt <16 x i64> undef, undef + %I64 = icmp sgt i64 %arg64, %arg64 + %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ugt(i32 %arg) { +define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ugt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, 
%argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ugt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 
= icmp ugt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> 
%argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ugt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ugt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
i32 undef ; ; AVX512F-LABEL: 'cmp_int_ugt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ugt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ugt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = 
icmp ugt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ugt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, 
%argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ugt' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 
for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: 
Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ugt i8 undef, undef - %V16I8 = icmp ugt <16 x i8> undef, undef - %V32I8 = icmp ugt <32 x i8> undef, undef - %V64I8 = icmp ugt <64 x i8> undef, undef - %V128I8 = icmp ugt <128 x i8> undef, undef + %I8 = icmp ugt i8 %arg8, %arg8 + %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ugt i16 undef, undef - %V8I16 = icmp ugt <8 x i16> undef, undef - %V16I16 = icmp ugt <16 x i16> undef, undef - %V32I16 = icmp ugt <32 x i16> undef, undef - %V64I16 = icmp ugt <64 x i16> undef, undef + %I16 = icmp ugt i16 %arg16, %arg16 + %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ugt i32 undef, undef - %V4I32 = icmp ugt <4 x i32> undef, undef - %V8I32 = icmp ugt <8 x i32> undef, undef - %V16I32 = icmp ugt <16 x i32> undef, undef - %V32I32 = icmp ugt <32 x i32> undef, undef + %I32 = icmp ugt i32 %arg32, %arg32 + %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ugt i64 undef, undef - %V2I64 = icmp ugt <2 x i64> undef, undef - %V4I64 = icmp ugt <4 x i64> undef, undef - %V8I64 = icmp ugt <8 x i64> undef, undef - %V16I64 = icmp ugt <16 x i64> undef, undef + %I64 = icmp ugt i64 %arg64, %arg64 + %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ugt 
<4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sle(i32 %arg) { +define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sle' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sle' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x 
i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sle' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; 
AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; 
AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sle' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> 
%argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sle' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x 
i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sle' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX512BW-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> 
%argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sle' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = 
icmp sle <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, 
%argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sle' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp 
sle i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_sle' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle 
<8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 
for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp sle i8 undef, undef - %V16I8 = icmp sle <16 
x i8> undef, undef - %V32I8 = icmp sle <32 x i8> undef, undef - %V64I8 = icmp sle <64 x i8> undef, undef - %V128I8 = icmp sle <128 x i8> undef, undef + %I8 = icmp sle i8 %arg8, %arg8 + %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sle i16 undef, undef - %V8I16 = icmp sle <8 x i16> undef, undef - %V16I16 = icmp sle <16 x i16> undef, undef - %V32I16 = icmp sle <32 x i16> undef, undef - %V64I16 = icmp sle <64 x i16> undef, undef + %I16 = icmp sle i16 %arg16, %arg16 + %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sle i32 undef, undef - %V4I32 = icmp sle <4 x i32> undef, undef - %V8I32 = icmp sle <8 x i32> undef, undef - %V16I32 = icmp sle <16 x i32> undef, undef - %V32I32 = icmp sle <32 x i32> undef, undef + %I32 = icmp sle i32 %arg32, %arg32 + %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sle i64 undef, undef - %V2I64 = icmp sle <2 x i64> undef, undef - %V4I64 = icmp sle <4 x i64> undef, undef - %V8I64 = icmp sle <8 x i64> undef, undef - %V16I64 = icmp sle <16 x i64> undef, undef + %I64 = icmp sle i64 %arg64, %arg64 + %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ule(i32 %arg) { +define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> 
%argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE42-LABEL: 'cmp_int_ule' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; 
SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ule' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX1-NEXT: 
Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ule' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX2-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 
%arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ule' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef 
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> 
%argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ule' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ule' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = 
icmp ule <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ule' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ule' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ule i8 undef, undef - %V16I8 = icmp ule <16 x i8> undef, undef - %V32I8 = icmp ule <32 x i8> undef, undef - %V64I8 = icmp ule <64 x i8> undef, undef - %V128I8 = icmp ule <128 x i8> undef, undef + %I8 = icmp ule i8 %arg8, %arg8 + %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ule i16 undef, undef - %V8I16 = icmp ule <8 x i16> undef, undef - %V16I16 = icmp ule <16 x i16> undef, undef - %V32I16 = icmp ule <32 x i16> undef, undef - %V64I16 = icmp ule <64 x i16> undef, undef + %I16 = icmp ule i16 %arg16, %arg16 + %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ule i32 undef, undef - %V4I32 = icmp ule <4 x i32> undef, undef - %V8I32 = icmp ule <8 x i32> undef, undef - %V16I32 = icmp ule <16 x i32> undef, undef - %V32I32 = icmp ule <32 x i32> undef, undef + 
%I32 = icmp ule i32 %arg32, %arg32 + %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ule i64 undef, undef - %V2I64 = icmp ule <2 x i64> undef, undef - %V4I64 = icmp ule <4 x i64> undef, undef - %V8I64 = icmp ule <8 x i64> undef, undef - %V16I64 = icmp ule <16 x i64> undef, undef + %I64 = icmp ule i64 %arg64, %arg64 + %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_slt(i32 %arg) { +define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_slt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = 
icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_slt' -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; 
SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp 
slt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_slt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX1-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_slt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, 
%argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_slt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX512F-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> 
%argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_slt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_slt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_slt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_slt' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> 
undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt 
<16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 
x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp slt i8 undef, undef - %V16I8 = icmp slt <16 x i8> undef, undef - %V32I8 = icmp slt <32 x i8> undef, undef - %V64I8 = icmp slt <64 x i8> undef, undef - %V128I8 = icmp slt <128 x i8> undef, undef + %I8 = icmp slt i8 %arg8, %arg8 + %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp slt i16 undef, undef - %V8I16 = icmp slt <8 x i16> undef, undef - %V16I16 = icmp slt <16 x i16> undef, undef - %V32I16 = icmp slt <32 x i16> undef, undef - %V64I16 = icmp slt <64 x i16> undef, undef + %I16 = icmp slt i16 %arg16, %arg16 + %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp slt i32 undef, undef - %V4I32 = icmp slt <4 x i32> undef, undef - %V8I32 = icmp slt <8 x i32> undef, undef - %V16I32 = icmp slt <16 x i32> undef, undef - %V32I32 = icmp slt <32 x i32> undef, undef + %I32 = icmp slt i32 %arg32, %arg32 + %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp slt i64 undef, undef - %V2I64 = icmp slt <2 x i64> undef, undef - %V4I64 = icmp slt <4 x i64> undef, undef - %V8I64 = icmp slt <8 
x i64> undef, undef - %V16I64 = icmp slt <16 x i64> undef, undef + %I64 = icmp slt i64 %arg64, %arg64 + %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ult(i32 %arg) { +define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ult' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I32 = icmp ult i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> 
%argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ult' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: 
%V64I8 = icmp ult <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found 
an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult 
<4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ult' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 
for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ult' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V16I16 = icmp ult <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ult' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = 
icmp ult i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ult' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ult' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ult <16 
x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ult' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ult' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult 
<128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 
= icmp ult <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated 
cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ult i8 undef, undef - %V16I8 = icmp ult <16 x i8> undef, undef - %V32I8 = icmp ult <32 x i8> undef, undef - %V64I8 = icmp ult <64 x i8> undef, undef - %V128I8 = icmp ult <128 x i8> undef, undef + %I8 = icmp ult i8 %arg8, %arg8 + %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ult i16 undef, undef - %V8I16 = icmp ult <8 x i16> undef, undef - %V16I16 = icmp ult <16 x i16> undef, undef - %V32I16 = icmp ult <32 x i16> undef, undef - %V64I16 = icmp ult <64 x i16> undef, undef + %I16 = icmp ult i16 %arg16, %arg16 + %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ult i32 undef, undef - %V4I32 = icmp ult <4 x i32> undef, undef - %V8I32 = icmp ult <8 x i32> undef, undef - %V16I32 = icmp ult <16 x i32> undef, undef - %V32I32 = icmp ult <32 x i32> undef, undef + %I32 = icmp ult i32 %arg32, %arg32 + %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ult i64 undef, undef - %V2I64 = icmp ult <2 x i64> undef, undef - %V4I64 = icmp ult <4 x i64> undef, undef - %V8I64 = icmp ult <8 x i64> undef, undef - %V16I64 = icmp ult <16 x i64> undef, undef + %I64 = icmp ult i64 %arg64, %arg64 + %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ret 
i32 undef } diff --git a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll index 4414f4f1d8ebd..2cac3af5c01b8 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll @@ -15,2076 +15,2076 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 -define i32 @cmp_int_eq(i32 %arg) { +define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_eq' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 
%arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_eq' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp 
eq <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_eq' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp eq <8 x i32> 
undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; 
AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_eq' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 
'cmp_int_eq' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_eq' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef 
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_eq' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost 
of 6 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp eq <16 x 
i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_eq' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost 
of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp eq i8 undef, undef - %V16I8 = icmp eq <16 x i8> undef, undef - %V32I8 = icmp eq <32 x i8> undef, undef - %V64I8 = icmp eq <64 x i8> undef, undef - %V128I8 = icmp eq <128 x i8> undef, undef + %I8 = icmp eq i8 %arg8, %arg8 + %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp eq i16 undef, undef - %V8I16 = icmp eq <8 x i16> undef, undef - %V16I16 = icmp eq <16 x i16> undef, undef - %V32I16 = icmp eq <32 x i16> undef, undef - %V64I16 = icmp eq 
<64 x i16> undef, undef + %I16 = icmp eq i16 %arg16, %arg16 + %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp eq i32 undef, undef - %V4I32 = icmp eq <4 x i32> undef, undef - %V8I32 = icmp eq <8 x i32> undef, undef - %V16I32 = icmp eq <16 x i32> undef, undef - %V32I32 = icmp eq <32 x i32> undef, undef + %I32 = icmp eq i32 %arg32, %arg32 + %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp eq i64 undef, undef - %V2I64 = icmp eq <2 x i64> undef, undef - %V4I64 = icmp eq <4 x i64> undef, undef - %V8I64 = icmp eq <8 x i64> undef, undef - %V16I64 = icmp eq <16 x i64> undef, undef + %I64 = icmp eq i64 %arg64, %arg64 + %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ne(i32 %arg) { +define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ne' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; 
SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x 
i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ne' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ne' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 
= icmp ne <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ne' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX2-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX2-NEXT: Cost 
Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX2-NEXT: 
Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ne' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ne' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ne' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp 
ne <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ne' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 
3 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = 
icmp ne <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ne i8 undef, undef - 
%V16I8 = icmp ne <16 x i8> undef, undef - %V32I8 = icmp ne <32 x i8> undef, undef - %V64I8 = icmp ne <64 x i8> undef, undef - %V128I8 = icmp ne <128 x i8> undef, undef + %I8 = icmp ne i8 %arg8, %arg8 + %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ne i16 undef, undef - %V8I16 = icmp ne <8 x i16> undef, undef - %V16I16 = icmp ne <16 x i16> undef, undef - %V32I16 = icmp ne <32 x i16> undef, undef - %V64I16 = icmp ne <64 x i16> undef, undef + %I16 = icmp ne i16 %arg16, %arg16 + %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ne i32 undef, undef - %V4I32 = icmp ne <4 x i32> undef, undef - %V8I32 = icmp ne <8 x i32> undef, undef - %V16I32 = icmp ne <16 x i32> undef, undef - %V32I32 = icmp ne <32 x i32> undef, undef + %I32 = icmp ne i32 %arg32, %arg32 + %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ne i64 undef, undef - %V2I64 = icmp ne <2 x i64> undef, undef - %V4I64 = icmp ne <4 x i64> undef, undef - %V8I64 = icmp ne <8 x i64> undef, undef - %V16I64 = icmp ne <16 x i64> undef, undef + %I64 = icmp ne i64 %arg64, %arg64 + %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sge(i32 %arg) { +define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> 
%argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sge' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sge' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V32I16 = icmp sge <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sge' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V16I8 = icmp sge <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 28 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; 
AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sge' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 
for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 
= icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sge' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I16 = icmp sge i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sge <32 x i8> 
%argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sge' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sge' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: 
Found an estimated cost of 6 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an 
estimated cost of 24 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sge' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> 
undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp sge i8 undef, undef - %V16I8 = icmp sge <16 x i8> undef, undef - %V32I8 = icmp sge <32 x i8> undef, undef - %V64I8 = icmp sge <64 x i8> undef, undef - %V128I8 = icmp sge <128 x i8> undef, undef + %I8 = icmp sge i8 %arg8, %arg8 + %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sge i16 undef, undef - %V8I16 = icmp sge <8 x i16> undef, undef - %V16I16 = icmp sge <16 x i16> undef, undef - %V32I16 = icmp sge <32 x i16> undef, undef - %V64I16 = icmp sge <64 x i16> undef, undef + %I16 = icmp sge i16 %arg16, %arg16 + %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sge i32 undef, undef - %V4I32 = icmp sge <4 x i32> undef, undef - %V8I32 = icmp sge <8 x i32> undef, undef - %V16I32 = icmp sge <16 x i32> undef, undef - %V32I32 
= icmp sge <32 x i32> undef, undef + %I32 = icmp sge i32 %arg32, %arg32 + %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sge i64 undef, undef - %V2I64 = icmp sge <2 x i64> undef, undef - %V4I64 = icmp sge <4 x i64> undef, undef - %V8I64 = icmp sge <8 x i64> undef, undef - %V16I64 = icmp sge <16 x i64> undef, undef + %I64 = icmp sge i64 %arg64, %arg64 + %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_uge(i32 %arg) { +define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE42-LABEL: 'cmp_int_uge' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp 
uge <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
i32 undef ; ; AVX1-LABEL: 'cmp_int_uge' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 
for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 28 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_uge' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I32 = icmp uge i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_uge' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x 
i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_uge' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_uge' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I16 = icmp uge i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp uge <32 
x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 
+; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_uge' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; XOPAVX2-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> 
%argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp uge i8 undef, undef - %V16I8 = icmp uge <16 x i8> undef, undef - %V32I8 = icmp uge <32 x i8> undef, undef - %V64I8 = icmp uge <64 x i8> undef, undef - %V128I8 = icmp uge <128 x i8> undef, undef + %I8 = icmp uge i8 %arg8, %arg8 + %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp uge i16 undef, undef - %V8I16 = icmp uge <8 x i16> undef, undef - %V16I16 = icmp uge <16 x i16> undef, undef - %V32I16 = icmp uge <32 x i16> undef, undef - %V64I16 = icmp uge <64 x i16> undef, undef + %I16 = icmp uge i16 %arg16, %arg16 + %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp uge <64 x i16> 
%argv64i16, %argv64i16 - %I32 = icmp uge i32 undef, undef - %V4I32 = icmp uge <4 x i32> undef, undef - %V8I32 = icmp uge <8 x i32> undef, undef - %V16I32 = icmp uge <16 x i32> undef, undef - %V32I32 = icmp uge <32 x i32> undef, undef + %I32 = icmp uge i32 %arg32, %arg32 + %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp uge i64 undef, undef - %V2I64 = icmp uge <2 x i64> undef, undef - %V4I64 = icmp uge <4 x i64> undef, undef - %V8I64 = icmp uge <8 x i64> undef, undef - %V16I64 = icmp uge <16 x i64> undef, undef + %I64 = icmp uge i64 %arg64, %arg64 + %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sgt(i32 %arg) { +define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sgt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef 
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt 
<32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sgt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef 
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> 
%argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sgt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef 
-; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sgt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> 
%argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sgt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef 
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp 
sgt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sgt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sgt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = 
icmp sgt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sgt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; XOPAVX2-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found 
an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret i32 undef ; - %I8 = icmp sgt i8 undef, undef - %V16I8 = icmp sgt <16 x i8> undef, undef - %V32I8 = icmp sgt <32 x i8> undef, undef - %V64I8 = icmp sgt <64 x i8> undef, undef - %V128I8 = icmp sgt <128 x i8> undef, undef + %I8 = icmp sgt i8 %arg8, %arg8 + %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sgt i16 undef, undef - %V8I16 = icmp sgt <8 x i16> undef, undef - %V16I16 = icmp sgt <16 x i16> undef, undef - %V32I16 = icmp sgt <32 x i16> undef, undef - %V64I16 = icmp sgt <64 x i16> undef, undef + %I16 = icmp sgt i16 %arg16, %arg16 + %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sgt i32 undef, undef - %V4I32 = icmp sgt <4 x i32> undef, undef - %V8I32 = icmp sgt <8 x i32> undef, undef - %V16I32 = icmp sgt <16 x i32> undef, undef - %V32I32 = icmp sgt <32 x i32> undef, undef + %I32 = icmp sgt i32 %arg32, %arg32 + %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sgt i64 undef, undef - %V2I64 = icmp sgt <2 x i64> undef, undef - %V4I64 = icmp sgt <4 x i64> undef, undef - %V8I64 = icmp sgt <8 x i64> undef, undef - %V16I64 = icmp sgt <16 x i64> undef, undef + %I64 = icmp sgt i64 %arg64, %arg64 + %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ugt(i32 %arg) { +define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> 
%argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ugt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ugt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%V16I16 = icmp ugt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, 
%argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ugt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %I8 = icmp ugt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; 
AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ugt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x 
i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ugt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: 
%V128I8 = icmp ugt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, 
%argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ugt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp 
ugt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ugt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = 
icmp ugt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ugt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, 
undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ugt i8 undef, undef - %V16I8 = icmp ugt <16 x i8> undef, undef - %V32I8 = icmp ugt <32 x i8> undef, undef - %V64I8 = icmp ugt <64 x i8> undef, undef - %V128I8 = icmp ugt <128 x i8> undef, undef + %I8 = icmp ugt i8 %arg8, %arg8 + %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ugt i16 undef, undef - %V8I16 = icmp ugt <8 x i16> undef, undef - %V16I16 = icmp ugt <16 x i16> undef, undef - %V32I16 = icmp ugt <32 x i16> undef, undef - %V64I16 = icmp ugt <64 x i16> undef, undef + %I16 = icmp ugt i16 %arg16, %arg16 + %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ugt i32 undef, undef - %V4I32 = icmp ugt <4 x i32> 
undef, undef - %V8I32 = icmp ugt <8 x i32> undef, undef - %V16I32 = icmp ugt <16 x i32> undef, undef - %V32I32 = icmp ugt <32 x i32> undef, undef + %I32 = icmp ugt i32 %arg32, %arg32 + %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ugt i64 undef, undef - %V2I64 = icmp ugt <2 x i64> undef, undef - %V4I64 = icmp ugt <4 x i64> undef, undef - %V8I64 = icmp ugt <8 x i64> undef, undef - %V16I64 = icmp ugt <16 x i64> undef, undef + %I64 = icmp ugt i64 %arg64, %arg64 + %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sle(i32 %arg) { +define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sle' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle 
i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; 
SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sle' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 
= icmp sle <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sle' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 
= icmp sle <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sle' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef 
-; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp 
sle i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sle' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V16I32 = icmp sle <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sle' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sle' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, 
%argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sle' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V16I16 = icmp sle <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = 
icmp sle <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp sle i8 
undef, undef - %V16I8 = icmp sle <16 x i8> undef, undef - %V32I8 = icmp sle <32 x i8> undef, undef - %V64I8 = icmp sle <64 x i8> undef, undef - %V128I8 = icmp sle <128 x i8> undef, undef + %I8 = icmp sle i8 %arg8, %arg8 + %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sle i16 undef, undef - %V8I16 = icmp sle <8 x i16> undef, undef - %V16I16 = icmp sle <16 x i16> undef, undef - %V32I16 = icmp sle <32 x i16> undef, undef - %V64I16 = icmp sle <64 x i16> undef, undef + %I16 = icmp sle i16 %arg16, %arg16 + %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sle i32 undef, undef - %V4I32 = icmp sle <4 x i32> undef, undef - %V8I32 = icmp sle <8 x i32> undef, undef - %V16I32 = icmp sle <16 x i32> undef, undef - %V32I32 = icmp sle <32 x i32> undef, undef + %I32 = icmp sle i32 %arg32, %arg32 + %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sle i64 undef, undef - %V2I64 = icmp sle <2 x i64> undef, undef - %V4I64 = icmp sle <4 x i64> undef, undef - %V8I64 = icmp sle <8 x i64> undef, undef - %V16I64 = icmp sle <16 x i64> undef, undef + %I64 = icmp sle i64 %arg64, %arg64 + %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ule(i32 %arg) { +define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x 
i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE42-LABEL: 'cmp_int_ule' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> 
undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> 
%argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ule' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp ule <16 x i16> 
undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ule' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX2-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ule <8 x i64> 
undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I64 = icmp ule i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ule' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 
x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = 
icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ule' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ule' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ule' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 
undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: 
Found an estimated cost of 20 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ule i8 undef, undef - %V16I8 = icmp ule <16 x i8> undef, undef - %V32I8 = icmp ule <32 x i8> undef, undef - %V64I8 = icmp ule <64 x i8> undef, undef - %V128I8 = icmp ule <128 x i8> undef, undef + %I8 = icmp ule i8 %arg8, %arg8 + %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ule i16 undef, undef - %V8I16 = icmp ule <8 x i16> undef, undef - %V16I16 = icmp ule <16 x i16> undef, undef - %V32I16 = icmp ule <32 x i16> undef, undef - %V64I16 = icmp ule <64 x i16> undef, undef + %I16 = icmp ule i16 %arg16, %arg16 + %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ule i32 undef, undef - %V4I32 = icmp ule <4 x i32> undef, undef - %V8I32 = icmp ule <8 x i32> undef, undef - %V16I32 = icmp ule <16 x i32> undef, undef - %V32I32 = icmp ule <32 x i32> undef, undef + %I32 = icmp ule i32 %arg32, %arg32 + %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ule i64 undef, undef - %V2I64 = icmp ule <2 x i64> undef, undef - %V4I64 = icmp ule <4 x i64> undef, undef - %V8I64 = icmp ule <8 x i64> undef, undef - %V16I64 = icmp ule <16 x i64> undef, undef + %I64 = icmp ule i64 %arg64, %arg64 + %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ule <16 x i64> 
%argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_slt(i32 %arg) { +define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_slt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_slt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> 
%argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_slt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX1-NEXT: 
Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, 
%argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_slt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp 
slt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_slt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 
%arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_slt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_slt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; XOPAVX1-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost 
of 6 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: 
%V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_slt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp 
slt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp slt i8 undef, undef - %V16I8 = icmp slt <16 x i8> undef, undef - %V32I8 = icmp slt <32 x i8> undef, undef - %V64I8 = icmp slt <64 x i8> undef, undef - %V128I8 = icmp slt <128 x i8> undef, undef + %I8 = icmp slt i8 %arg8, %arg8 + %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp slt i16 undef, undef - %V8I16 = icmp slt <8 x i16> undef, undef - %V16I16 = icmp slt <16 x i16> undef, undef - %V32I16 = icmp slt <32 x i16> undef, undef - %V64I16 = icmp slt <64 x i16> undef, undef + %I16 = icmp slt i16 %arg16, %arg16 + %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp slt <32 x i16> %argv32i16, 
%argv32i16 + %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp slt i32 undef, undef - %V4I32 = icmp slt <4 x i32> undef, undef - %V8I32 = icmp slt <8 x i32> undef, undef - %V16I32 = icmp slt <16 x i32> undef, undef - %V32I32 = icmp slt <32 x i32> undef, undef + %I32 = icmp slt i32 %arg32, %arg32 + %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp slt i64 undef, undef - %V2I64 = icmp slt <2 x i64> undef, undef - %V4I64 = icmp slt <4 x i64> undef, undef - %V8I64 = icmp slt <8 x i64> undef, undef - %V16I64 = icmp slt <16 x i64> undef, undef + %I64 = icmp slt i64 %arg64, %arg64 + %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ult(i32 %arg) { +define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ult' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%V128I8 = icmp ult <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> 
%argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ult' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: 
Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ult' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp 
ult i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ult' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found 
an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ult' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX512F-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, 
%argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ult' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = 
icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x 
i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ult' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x 
i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ult' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %I8 = icmp ult i8 undef, undef - %V16I8 = icmp ult <16 x i8> undef, undef - %V32I8 = icmp ult <32 x i8> undef, undef - %V64I8 = icmp ult <64 x i8> undef, undef - %V128I8 = icmp ult <128 x i8> undef, undef + %I8 = icmp ult i8 %arg8, %arg8 + %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ult i16 undef, undef - %V8I16 = icmp ult <8 x i16> undef, undef - %V16I16 = icmp ult <16 x i16> undef, undef - %V32I16 = icmp ult <32 x i16> undef, undef - %V64I16 = icmp ult <64 x i16> undef, undef + %I16 = icmp ult i16 %arg16, %arg16 + %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ult i32 undef, undef - %V4I32 = icmp ult <4 x i32> undef, undef - %V8I32 = icmp ult <8 x i32> undef, undef - %V16I32 = icmp ult <16 x i32> undef, undef - %V32I32 = icmp ult <32 x i32> undef, undef + %I32 = icmp ult i32 %arg32, %arg32 + %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ult i64 undef, undef - %V2I64 = icmp ult <2 x i64> undef, undef - %V4I64 = icmp ult <4 x i64> undef, undef - %V8I64 = icmp ult <8 x i64> undef, undef - %V16I64 = icmp ult <16 x i64> undef, undef + %I64 = icmp ult i64 %arg64, %arg64 + %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ret i32 undef } 
diff --git a/llvm/test/Analysis/CostModel/X86/icmp.ll b/llvm/test/Analysis/CostModel/X86/icmp.ll index bac40044b783e..b1d13ffa605ed 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp.ll @@ -15,3042 +15,3042 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 -define i32 @cmp_int_eq(i32 %arg) { +define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_eq' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE2-NEXT: 
Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x 
i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_eq' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp 
eq <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 
= icmp eq i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for 
instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_eq' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSSE3-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_eq' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> 
%argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_eq' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I8 = icmp eq i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_eq' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x 
i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_eq' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, 
%argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_eq' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_eq' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; AVX512BW-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_eq' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> undef, 
undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_eq' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4I32 = icmp eq <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_eq' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %V128I8 = icmp eq <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 
16 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp eq i8 undef, undef - %V16I8 = icmp eq <16 x i8> undef, undef - %V32I8 = icmp eq <32 x i8> undef, undef - %V64I8 = icmp eq <64 x i8> undef, undef - %V128I8 = icmp eq <128 x i8> undef, undef + %I8 = icmp eq i8 %arg8, %arg8 + %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp eq i16 undef, undef - %V8I16 = icmp eq <8 x i16> undef, undef - %V16I16 = icmp eq <16 x i16> undef, undef - %V32I16 = icmp eq <32 x i16> undef, undef - %V64I16 = icmp eq <64 x i16> undef, undef + %I16 = icmp eq i16 %arg16, %arg16 + %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp eq i32 undef, undef - %V4I32 = icmp eq <4 x i32> undef, undef - %V8I32 = icmp eq <8 x i32> undef, undef - %V16I32 = icmp eq <16 x i32> undef, undef - %V32I32 = icmp eq <32 x i32> undef, undef + %I32 = icmp eq i32 %arg32, %arg32 + %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp eq i64 undef, undef - %V2I64 = icmp eq <2 x i64> undef, undef - %V4I64 = icmp eq <4 x i64> undef, undef - %V8I64 = icmp eq <8 x i64> undef, undef - %V16I64 = icmp eq <16 x i64> undef, undef + %I64 = icmp eq i64 %arg64, %arg64 + %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ne(i32 %arg) { 
+define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ne' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, 
undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_ne' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_ne' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp 
ne <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_ne' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; 
SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ne' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, 
%argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ne' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated 
cost of 10 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ne' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne 
<64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ne' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 
%arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ne' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ne' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne 
i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ne' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, 
undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = 
icmp ne <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ne' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, 
undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne 
<8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp ne i8 undef, undef - %V16I8 = icmp ne <16 x i8> undef, undef - %V32I8 = icmp ne <32 x i8> undef, undef - %V64I8 = icmp ne <64 x i8> undef, undef - %V128I8 = icmp ne <128 x i8> undef, undef + %I8 = icmp ne i8 %arg8, %arg8 + %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 + 
%V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ne i16 undef, undef - %V8I16 = icmp ne <8 x i16> undef, undef - %V16I16 = icmp ne <16 x i16> undef, undef - %V32I16 = icmp ne <32 x i16> undef, undef - %V64I16 = icmp ne <64 x i16> undef, undef + %I16 = icmp ne i16 %arg16, %arg16 + %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ne i32 undef, undef - %V4I32 = icmp ne <4 x i32> undef, undef - %V8I32 = icmp ne <8 x i32> undef, undef - %V16I32 = icmp ne <16 x i32> undef, undef - %V32I32 = icmp ne <32 x i32> undef, undef + %I32 = icmp ne i32 %arg32, %arg32 + %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ne i64 undef, undef - %V2I64 = icmp ne <2 x i64> undef, undef - %V4I64 = icmp ne <4 x i64> undef, undef - %V8I64 = icmp ne <8 x i64> undef, undef - %V16I64 = icmp ne <16 x i64> undef, undef + %I64 = icmp ne i64 %arg64, %arg64 + %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sge(i32 %arg) { +define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sge' -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, 
%argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_sge' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 
= icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_sge' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_sge' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; 
SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sge' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; 
SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sge' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef 
-; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = 
icmp sge <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sge' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found 
an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sge' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 
x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sge' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 
x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sge' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sge' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 
= icmp sge <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_sge' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; 
SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp sge i8 undef, undef - %V16I8 = icmp sge <16 x i8> undef, undef - %V32I8 = icmp sge <32 x i8> undef, undef - %V64I8 = icmp sge <64 x i8> undef, undef - %V128I8 = icmp sge <128 x i8> undef, undef + %I8 = icmp sge i8 %arg8, %arg8 + %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 + %V64I8 = 
icmp sge <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sge i16 undef, undef - %V8I16 = icmp sge <8 x i16> undef, undef - %V16I16 = icmp sge <16 x i16> undef, undef - %V32I16 = icmp sge <32 x i16> undef, undef - %V64I16 = icmp sge <64 x i16> undef, undef + %I16 = icmp sge i16 %arg16, %arg16 + %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sge i32 undef, undef - %V4I32 = icmp sge <4 x i32> undef, undef - %V8I32 = icmp sge <8 x i32> undef, undef - %V16I32 = icmp sge <16 x i32> undef, undef - %V32I32 = icmp sge <32 x i32> undef, undef + %I32 = icmp sge i32 %arg32, %arg32 + %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sge i64 undef, undef - %V2I64 = icmp sge <2 x i64> undef, undef - %V4I64 = icmp sge <4 x i64> undef, undef - %V8I64 = icmp sge <8 x i64> undef, undef - %V16I64 = icmp sge <16 x i64> undef, undef + %I64 = icmp sge i64 %arg64, %arg64 + %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_uge(i32 %arg) { +define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_uge' -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef 
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x 
i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_uge' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SSE3-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_uge' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, 
%argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost 
of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_uge' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x 
i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> 
%argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_uge' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x 
i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_uge' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp 
uge i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for 
instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_uge' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp 
uge <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_uge' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> 
undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_uge' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; AVX512BW-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_uge' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp 
uge <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> 
%argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_uge' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> 
%argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_uge' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, 
undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp uge i8 undef, undef - %V16I8 = icmp uge <16 x i8> undef, undef - %V32I8 = icmp uge <32 x i8> undef, undef - %V64I8 = icmp uge <64 x i8> undef, undef - %V128I8 = icmp uge <128 x i8> undef, undef + %I8 = icmp uge i8 %arg8, %arg8 + %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 + %V32I8 = 
icmp uge <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp uge i16 undef, undef - %V8I16 = icmp uge <8 x i16> undef, undef - %V16I16 = icmp uge <16 x i16> undef, undef - %V32I16 = icmp uge <32 x i16> undef, undef - %V64I16 = icmp uge <64 x i16> undef, undef + %I16 = icmp uge i16 %arg16, %arg16 + %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp uge i32 undef, undef - %V4I32 = icmp uge <4 x i32> undef, undef - %V8I32 = icmp uge <8 x i32> undef, undef - %V16I32 = icmp uge <16 x i32> undef, undef - %V32I32 = icmp uge <32 x i32> undef, undef + %I32 = icmp uge i32 %arg32, %arg32 + %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp uge i64 undef, undef - %V2I64 = icmp uge <2 x i64> undef, undef - %V4I64 = icmp uge <4 x i64> undef, undef - %V8I64 = icmp uge <8 x i64> undef, undef - %V16I64 = icmp uge <16 x i64> undef, undef + %I64 = icmp uge i64 %arg64, %arg64 + %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sgt(i32 %arg) { +define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) 
{ ; SSE2-LABEL: 'cmp_int_sgt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for 
instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_sgt' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp 
sgt i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_sgt' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = 
icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost 
Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_sgt' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V32I32 = icmp sgt <32 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sgt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V16I16 = icmp sgt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, 
%argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sgt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I8 = icmp sgt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; 
AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sgt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x 
i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sgt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%V128I8 = icmp sgt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, 
%argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sgt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp 
sgt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sgt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp 
sgt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sgt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_sgt' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp 
sgt <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp sgt i8 undef, undef - %V16I8 = icmp sgt <16 x i8> undef, undef - %V32I8 = icmp sgt <32 x i8> undef, undef - %V64I8 = icmp sgt <64 x i8> undef, undef - %V128I8 = icmp sgt <128 x i8> undef, undef + %I8 = icmp sgt i8 %arg8, %arg8 + %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 
+ %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sgt i16 undef, undef - %V8I16 = icmp sgt <8 x i16> undef, undef - %V16I16 = icmp sgt <16 x i16> undef, undef - %V32I16 = icmp sgt <32 x i16> undef, undef - %V64I16 = icmp sgt <64 x i16> undef, undef + %I16 = icmp sgt i16 %arg16, %arg16 + %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sgt i32 undef, undef - %V4I32 = icmp sgt <4 x i32> undef, undef - %V8I32 = icmp sgt <8 x i32> undef, undef - %V16I32 = icmp sgt <16 x i32> undef, undef - %V32I32 = icmp sgt <32 x i32> undef, undef + %I32 = icmp sgt i32 %arg32, %arg32 + %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sgt i64 undef, undef - %V2I64 = icmp sgt <2 x i64> undef, undef - %V4I64 = icmp sgt <4 x i64> undef, undef - %V8I64 = icmp sgt <8 x i64> undef, undef - %V16I64 = icmp sgt <16 x i64> undef, undef + %I64 = icmp sgt i64 %arg64, %arg64 + %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ugt(i32 %arg) { +define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> 
%argv16i64) { ; SSE2-LABEL: 'cmp_int_ugt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 
14 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_ugt' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I32 = icmp ugt i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 
+; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_ugt' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x 
i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 
for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, 
%argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_ugt' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an 
estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ugt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found 
an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ugt' -; 
AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x 
i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ugt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ugt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ugt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ugt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I16 = icmp ugt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x 
i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ugt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> 
%argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ugt' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x 
i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp ugt i8 undef, undef - %V16I8 = icmp ugt <16 x i8> undef, undef - %V32I8 = icmp ugt <32 x i8> undef, undef - %V64I8 = icmp ugt <64 x i8> undef, undef - %V128I8 = 
icmp ugt <128 x i8> undef, undef + %I8 = icmp ugt i8 %arg8, %arg8 + %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ugt i16 undef, undef - %V8I16 = icmp ugt <8 x i16> undef, undef - %V16I16 = icmp ugt <16 x i16> undef, undef - %V32I16 = icmp ugt <32 x i16> undef, undef - %V64I16 = icmp ugt <64 x i16> undef, undef + %I16 = icmp ugt i16 %arg16, %arg16 + %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ugt i32 undef, undef - %V4I32 = icmp ugt <4 x i32> undef, undef - %V8I32 = icmp ugt <8 x i32> undef, undef - %V16I32 = icmp ugt <16 x i32> undef, undef - %V32I32 = icmp ugt <32 x i32> undef, undef + %I32 = icmp ugt i32 %arg32, %arg32 + %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ugt i64 undef, undef - %V2I64 = icmp ugt <2 x i64> undef, undef - %V4I64 = icmp ugt <4 x i64> undef, undef - %V8I64 = icmp ugt <8 x i64> undef, undef - %V16I64 = icmp ugt <16 x i64> undef, undef + %I64 = icmp ugt i64 %arg64, %arg64 + %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_sle(i32 %arg) { +define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> 
%argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sle' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_sle' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_sle' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle 
<32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, 
%argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_sle' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found 
an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sle' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SSE42-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp 
sle <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sle' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> undef, 
undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sle <16 x i32> 
%argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sle' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 
= icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sle' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost 
of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle 
i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_sle' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_sle' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; XOPAVX1-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 
4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = 
icmp sle <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_sle' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x 
i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_sle' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V16I16 = icmp sle <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp sle i8 undef, undef - %V16I8 = icmp sle <16 x i8> undef, undef - %V32I8 = icmp sle <32 x i8> undef, undef - %V64I8 = icmp sle 
<64 x i8> undef, undef - %V128I8 = icmp sle <128 x i8> undef, undef + %I8 = icmp sle i8 %arg8, %arg8 + %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp sle i16 undef, undef - %V8I16 = icmp sle <8 x i16> undef, undef - %V16I16 = icmp sle <16 x i16> undef, undef - %V32I16 = icmp sle <32 x i16> undef, undef - %V64I16 = icmp sle <64 x i16> undef, undef + %I16 = icmp sle i16 %arg16, %arg16 + %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp sle i32 undef, undef - %V4I32 = icmp sle <4 x i32> undef, undef - %V8I32 = icmp sle <8 x i32> undef, undef - %V16I32 = icmp sle <16 x i32> undef, undef - %V32I32 = icmp sle <32 x i32> undef, undef + %I32 = icmp sle i32 %arg32, %arg32 + %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp sle i64 undef, undef - %V2I64 = icmp sle <2 x i64> undef, undef - %V4I64 = icmp sle <4 x i64> undef, undef - %V8I64 = icmp sle <8 x i64> undef, undef - %V16I64 = icmp sle <16 x i64> undef, undef + %I64 = icmp sle i64 %arg64, %arg64 + %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ule(i32 %arg) { +define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, 
<8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ule' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SSE2-NEXT: 
Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_ule' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found 
an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule 
<8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_ule' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = 
icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_ule' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; 
SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, 
%argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ule' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, 
undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> 
%argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ule' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 
for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ule' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp 
ule <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ule' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp 
ule <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ule' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8I32 = icmp ule <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ule' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule 
<128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ule' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ule' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef 
-; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 
= icmp ule <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp ule i8 undef, undef - %V16I8 = icmp ule <16 x i8> undef, undef - 
%V32I8 = icmp ule <32 x i8> undef, undef - %V64I8 = icmp ule <64 x i8> undef, undef - %V128I8 = icmp ule <128 x i8> undef, undef + %I8 = icmp ule i8 %arg8, %arg8 + %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ule i16 undef, undef - %V8I16 = icmp ule <8 x i16> undef, undef - %V16I16 = icmp ule <16 x i16> undef, undef - %V32I16 = icmp ule <32 x i16> undef, undef - %V64I16 = icmp ule <64 x i16> undef, undef + %I16 = icmp ule i16 %arg16, %arg16 + %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ule i32 undef, undef - %V4I32 = icmp ule <4 x i32> undef, undef - %V8I32 = icmp ule <8 x i32> undef, undef - %V16I32 = icmp ule <16 x i32> undef, undef - %V32I32 = icmp ule <32 x i32> undef, undef + %I32 = icmp ule i32 %arg32, %arg32 + %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ule i64 undef, undef - %V2I64 = icmp ule <2 x i64> undef, undef - %V4I64 = icmp ule <4 x i64> undef, undef - %V8I64 = icmp ule <8 x i64> undef, undef - %V16I64 = icmp ule <16 x i64> undef, undef + %I64 = icmp ule i64 %arg64, %arg64 + %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_slt(i32 %arg) { +define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> 
%argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_slt' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %I64 = icmp slt i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_slt' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = 
icmp slt <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE3-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_slt' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, 
undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for 
instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSSE3-NEXT: Cost Model: 
Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_slt' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_slt' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_slt' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX1-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_slt' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_slt' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt 
<32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_slt' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_slt' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 
for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> 
%argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_slt' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; XOPAVX2-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 
%arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_slt' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x 
i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %I8 = icmp slt i8 undef, undef - %V16I8 = icmp slt <16 x i8> 
undef, undef - %V32I8 = icmp slt <32 x i8> undef, undef - %V64I8 = icmp slt <64 x i8> undef, undef - %V128I8 = icmp slt <128 x i8> undef, undef + %I8 = icmp slt i8 %arg8, %arg8 + %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp slt i16 undef, undef - %V8I16 = icmp slt <8 x i16> undef, undef - %V16I16 = icmp slt <16 x i16> undef, undef - %V32I16 = icmp slt <32 x i16> undef, undef - %V64I16 = icmp slt <64 x i16> undef, undef + %I16 = icmp slt i16 %arg16, %arg16 + %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp slt i32 undef, undef - %V4I32 = icmp slt <4 x i32> undef, undef - %V8I32 = icmp slt <8 x i32> undef, undef - %V16I32 = icmp slt <16 x i32> undef, undef - %V32I32 = icmp slt <32 x i32> undef, undef + %I32 = icmp slt i32 %arg32, %arg32 + %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp slt i64 undef, undef - %V2I64 = icmp slt <2 x i64> undef, undef - %V4I64 = icmp slt <4 x i64> undef, undef - %V8I64 = icmp slt <8 x i64> undef, undef - %V16I64 = icmp slt <16 x i64> undef, undef + %I64 = icmp slt i64 %arg64, %arg64 + %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ret i32 undef } -define i32 @cmp_int_ult(i32 %arg) { +define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 
x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ult' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_ult' -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; 
SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_ult' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = 
icmp ult <16 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSSE3-NEXT: Cost Model: Found 
an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 
%arg64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_ult' -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE41-NEXT: Cost Model: 
Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: 
Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ult' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; 
SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ult' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I64 = icmp ult i64 undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = 
icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ult' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> 
undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX2-NEXT: Cost Model: Found an estimated cost 
of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ult' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX512F-NEXT: 
Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX512F-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 
%arg64, %arg64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'cmp_int_ult' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult 
<4 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX1-LABEL: 'cmp_int_ult' -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> undef, 
undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; XOPAVX1-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; XOPAVX2-LABEL: 'cmp_int_ult' -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ult' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef -; SLM-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> 
%argv64i8, %argv64i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret i32 undef ; - %I8 = icmp ult i8 undef, undef - %V16I8 = icmp ult <16 x i8> undef, undef - %V32I8 = icmp ult <32 x i8> undef, undef - %V64I8 = icmp ult <64 x i8> undef, undef - %V128I8 = icmp ult <128 x i8> undef, undef + %I8 = icmp ult i8 %arg8, %arg8 + %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 + %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 + %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 + %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 - %I16 = icmp ult i16 undef, undef - %V8I16 = icmp ult <8 x i16> undef, undef - %V16I16 = icmp ult <16 x i16> undef, undef - %V32I16 = icmp ult <32 x i16> undef, undef - %V64I16 = icmp ult <64 x i16> undef, undef + %I16 = icmp ult i16 %arg16, %arg16 + %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 + %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 + %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 + %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 - %I32 = icmp ult i32 undef, undef - %V4I32 = icmp ult <4 x i32> undef, undef - %V8I32 = icmp ult <8 x i32> undef, undef - %V16I32 = icmp ult <16 x i32> undef, undef - %V32I32 = icmp ult <32 x i32> undef, undef + %I32 = icmp ult i32 %arg32, %arg32 + %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 + %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32 + %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32 + %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32 - %I64 = icmp ult i64 undef, undef - %V2I64 = icmp ult <2 x i64> undef, undef - %V4I64 = icmp ult <4 x i64> undef, undef - %V8I64 = icmp ult <8 x i64> undef, undef - %V16I64 = icmp ult <16 x i64> undef, undef + %I64 = icmp ult i64 %arg64, %arg64 + %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64 + %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64 + %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64 + %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ret i32 undef } From a0869b14cde9ed71bb4323c8717f59ee1b1e79bf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 14:35:50 
+0000 Subject: [PATCH 103/351] [CostModel][X86] Fix expanded CTPOP i8 costs Updated to match #79989 / 9410019ac977141bc73aee19690b5896ded59219 --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 2 +- llvm/test/Analysis/CostModel/X86/ctpop-codesize.ll | 2 +- llvm/test/Analysis/CostModel/X86/ctpop-sizelatency.ll | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index be1a094830899..f91e13f997f78 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4000,7 +4000,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } }, { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } }, - { ISD::CTPOP, MVT::i8, { 7, 6, 13, 13 } }, + { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } }, { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } }, { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } }, { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } }, diff --git a/llvm/test/Analysis/CostModel/X86/ctpop-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctpop-codesize.ll index 75adeee09197e..2e417efd52d12 100644 --- a/llvm/test/Analysis/CostModel/X86/ctpop-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/ctpop-codesize.ll @@ -57,7 +57,7 @@ define i16 @var_ctpop_i16(i16 %a) { define i8 @var_ctpop_i8(i8 %a) { ; NOPOPCNT-LABEL: 'var_ctpop_i8' -; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a) +; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a) ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctpop ; ; POPCNT-LABEL: 'var_ctpop_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctpop-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctpop-sizelatency.ll index b46cfe15569e6..0e629c23b71ca 100644 --- 
a/llvm/test/Analysis/CostModel/X86/ctpop-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/ctpop-sizelatency.ll @@ -57,7 +57,7 @@ define i16 @var_ctpop_i16(i16 %a) { define i8 @var_ctpop_i8(i8 %a) { ; NOPOPCNT-LABEL: 'var_ctpop_i8' -; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a) +; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a) ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctpop ; ; POPCNT-LABEL: 'var_ctpop_i8' From 4d4af15c3fb671ed9f7eef9f29ebd6fde15618df Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 21 Feb 2024 15:55:42 +0100 Subject: [PATCH 104/351] [NFC][flang][OpenMP] Split `DataSharing` and `Clause` processors (#81973) This started as an experiment to reduce the compilation time of iterating over `Lower/OpenMP.cpp` a bit since it is too slow at the moment. Trying to do that, I split the `DataSharingProcessor`, `ReductionProcessor`, and `ClauseProcessor` into their own files and extracted some shared code into a util file. All of these new `.h/.cpp` files as well as `OpenMP.cpp` are now under a `Lower/OpenMP/` directory. This resulted is a slightly better organization of the OpenMP lowering code and hence opening this NFC. As for the compilation time, this unfortunately does not affect it much (it shaves off a few seconds of `OpenMP.cpp` compilation) since from what I learned the bottleneck is in `DirectivesCommon.h` and `PFTBuilder.h` which both consume a lot of time in template instantiation it seems. 
--- flang/lib/Lower/CMakeLists.txt | 6 +- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 880 +++++++ flang/lib/Lower/OpenMP/ClauseProcessor.h | 305 +++ .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 350 +++ flang/lib/Lower/OpenMP/DataSharingProcessor.h | 89 + flang/lib/Lower/{ => OpenMP}/OpenMP.cpp | 2040 +---------------- flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 431 ++++ flang/lib/Lower/OpenMP/ReductionProcessor.h | 138 ++ flang/lib/Lower/OpenMP/Utils.cpp | 99 + flang/lib/Lower/OpenMP/Utils.h | 68 + 10 files changed, 2371 insertions(+), 2035 deletions(-) create mode 100644 flang/lib/Lower/OpenMP/ClauseProcessor.cpp create mode 100644 flang/lib/Lower/OpenMP/ClauseProcessor.h create mode 100644 flang/lib/Lower/OpenMP/DataSharingProcessor.cpp create mode 100644 flang/lib/Lower/OpenMP/DataSharingProcessor.h rename flang/lib/Lower/{ => OpenMP}/OpenMP.cpp (55%) create mode 100644 flang/lib/Lower/OpenMP/ReductionProcessor.cpp create mode 100644 flang/lib/Lower/OpenMP/ReductionProcessor.h create mode 100644 flang/lib/Lower/OpenMP/Utils.cpp create mode 100644 flang/lib/Lower/OpenMP/Utils.h diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index b13d415e02f1d..5577a60f1daea 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -24,7 +24,11 @@ add_flang_library(FortranLower LoweringOptions.cpp Mangler.cpp OpenACC.cpp - OpenMP.cpp + OpenMP/ClauseProcessor.cpp + OpenMP/DataSharingProcessor.cpp + OpenMP/OpenMP.cpp + OpenMP/ReductionProcessor.cpp + OpenMP/Utils.cpp PFTBuilder.cpp Runtime.cpp SymbolMap.cpp diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp new file mode 100644 index 0000000000000..4e3951492fb65 --- /dev/null +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -0,0 +1,880 @@ +//===-- ClauseProcessor.cpp -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#include "ClauseProcessor.h" + +#include "flang/Lower/PFTBuilder.h" +#include "flang/Parser/tools.h" +#include "flang/Semantics/tools.h" + +namespace Fortran { +namespace lower { +namespace omp { + +/// Check for unsupported map operand types. +static void checkMapType(mlir::Location location, mlir::Type type) { + if (auto refType = type.dyn_cast()) + type = refType.getElementType(); + if (auto boxType = type.dyn_cast_or_null()) + if (!boxType.getElementType().isa()) + TODO(location, "OMPD_target_data MapOperand BoxType"); +} + +static mlir::omp::ScheduleModifier +translateScheduleModifier(const Fortran::parser::OmpScheduleModifierType &m) { + switch (m.v) { + case Fortran::parser::OmpScheduleModifierType::ModType::Monotonic: + return mlir::omp::ScheduleModifier::monotonic; + case Fortran::parser::OmpScheduleModifierType::ModType::Nonmonotonic: + return mlir::omp::ScheduleModifier::nonmonotonic; + case Fortran::parser::OmpScheduleModifierType::ModType::Simd: + return mlir::omp::ScheduleModifier::simd; + } + return mlir::omp::ScheduleModifier::none; +} + +static mlir::omp::ScheduleModifier +getScheduleModifier(const Fortran::parser::OmpScheduleClause &x) { + const auto &modifier = + std::get>(x.t); + // The input may have the modifier any order, so we look for one that isn't + // SIMD. If modifier is not set at all, fall down to the bottom and return + // "none". 
+ if (modifier) { + const auto &modType1 = + std::get(modifier->t); + if (modType1.v.v == + Fortran::parser::OmpScheduleModifierType::ModType::Simd) { + const auto &modType2 = std::get< + std::optional>( + modifier->t); + if (modType2 && + modType2->v.v != + Fortran::parser::OmpScheduleModifierType::ModType::Simd) + return translateScheduleModifier(modType2->v); + + return mlir::omp::ScheduleModifier::none; + } + + return translateScheduleModifier(modType1.v); + } + return mlir::omp::ScheduleModifier::none; +} + +static mlir::omp::ScheduleModifier +getSimdModifier(const Fortran::parser::OmpScheduleClause &x) { + const auto &modifier = + std::get>(x.t); + // Either of the two possible modifiers in the input can be the SIMD modifier, + // so look in either one, and return simd if we find one. Not found = return + // "none". + if (modifier) { + const auto &modType1 = + std::get(modifier->t); + if (modType1.v.v == Fortran::parser::OmpScheduleModifierType::ModType::Simd) + return mlir::omp::ScheduleModifier::simd; + + const auto &modType2 = std::get< + std::optional>( + modifier->t); + if (modType2 && modType2->v.v == + Fortran::parser::OmpScheduleModifierType::ModType::Simd) + return mlir::omp::ScheduleModifier::simd; + } + return mlir::omp::ScheduleModifier::none; +} + +static void +genAllocateClause(Fortran::lower::AbstractConverter &converter, + const Fortran::parser::OmpAllocateClause &ompAllocateClause, + llvm::SmallVectorImpl &allocatorOperands, + llvm::SmallVectorImpl &allocateOperands) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::Location currentLocation = converter.getCurrentLocation(); + Fortran::lower::StatementContext stmtCtx; + + mlir::Value allocatorOperand; + const Fortran::parser::OmpObjectList &ompObjectList = + std::get(ompAllocateClause.t); + const auto &allocateModifier = std::get< + std::optional>( + ompAllocateClause.t); + + // If the allocate modifier is present, check if we only use the allocator + // submodifier. 
ALIGN in this context is unimplemented + const bool onlyAllocator = + allocateModifier && + std::holds_alternative< + Fortran::parser::OmpAllocateClause::AllocateModifier::Allocator>( + allocateModifier->u); + + if (allocateModifier && !onlyAllocator) { + TODO(currentLocation, "OmpAllocateClause ALIGN modifier"); + } + + // Check if allocate clause has allocator specified. If so, add it + // to list of allocators, otherwise, add default allocator to + // list of allocators. + if (onlyAllocator) { + const auto &allocatorValue = std::get< + Fortran::parser::OmpAllocateClause::AllocateModifier::Allocator>( + allocateModifier->u); + allocatorOperand = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(allocatorValue.v), stmtCtx)); + allocatorOperands.insert(allocatorOperands.end(), ompObjectList.v.size(), + allocatorOperand); + } else { + allocatorOperand = firOpBuilder.createIntegerConstant( + currentLocation, firOpBuilder.getI32Type(), 1); + allocatorOperands.insert(allocatorOperands.end(), ompObjectList.v.size(), + allocatorOperand); + } + genObjectList(ompObjectList, converter, allocateOperands); +} + +static mlir::omp::ClauseProcBindKindAttr genProcBindKindAttr( + fir::FirOpBuilder &firOpBuilder, + const Fortran::parser::OmpClause::ProcBind *procBindClause) { + mlir::omp::ClauseProcBindKind procBindKind; + switch (procBindClause->v.v) { + case Fortran::parser::OmpProcBindClause::Type::Master: + procBindKind = mlir::omp::ClauseProcBindKind::Master; + break; + case Fortran::parser::OmpProcBindClause::Type::Close: + procBindKind = mlir::omp::ClauseProcBindKind::Close; + break; + case Fortran::parser::OmpProcBindClause::Type::Spread: + procBindKind = mlir::omp::ClauseProcBindKind::Spread; + break; + case Fortran::parser::OmpProcBindClause::Type::Primary: + procBindKind = mlir::omp::ClauseProcBindKind::Primary; + break; + } + return mlir::omp::ClauseProcBindKindAttr::get(firOpBuilder.getContext(), + procBindKind); +} + +static 
mlir::omp::ClauseTaskDependAttr +genDependKindAttr(fir::FirOpBuilder &firOpBuilder, + const Fortran::parser::OmpClause::Depend *dependClause) { + mlir::omp::ClauseTaskDepend pbKind; + switch ( + std::get( + std::get(dependClause->v.u) + .t) + .v) { + case Fortran::parser::OmpDependenceType::Type::In: + pbKind = mlir::omp::ClauseTaskDepend::taskdependin; + break; + case Fortran::parser::OmpDependenceType::Type::Out: + pbKind = mlir::omp::ClauseTaskDepend::taskdependout; + break; + case Fortran::parser::OmpDependenceType::Type::Inout: + pbKind = mlir::omp::ClauseTaskDepend::taskdependinout; + break; + default: + llvm_unreachable("unknown parser task dependence type"); + break; + } + return mlir::omp::ClauseTaskDependAttr::get(firOpBuilder.getContext(), + pbKind); +} + +static mlir::Value getIfClauseOperand( + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::OmpClause::If *ifClause, + Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName, + mlir::Location clauseLocation) { + // Only consider the clause if it's intended for the given directive. 
+ auto &directive = std::get< + std::optional>( + ifClause->v.t); + if (directive && directive.value() != directiveName) + return nullptr; + + Fortran::lower::StatementContext stmtCtx; + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + auto &expr = std::get(ifClause->v.t); + mlir::Value ifVal = fir::getBase( + converter.genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx)); + return firOpBuilder.createConvert(clauseLocation, firOpBuilder.getI1Type(), + ifVal); +} + +static void +addUseDeviceClause(Fortran::lower::AbstractConverter &converter, + const Fortran::parser::OmpObjectList &useDeviceClause, + llvm::SmallVectorImpl &operands, + llvm::SmallVectorImpl &useDeviceTypes, + llvm::SmallVectorImpl &useDeviceLocs, + llvm::SmallVectorImpl + &useDeviceSymbols) { + genObjectList(useDeviceClause, converter, operands); + for (mlir::Value &operand : operands) { + checkMapType(operand.getLoc(), operand.getType()); + useDeviceTypes.push_back(operand.getType()); + useDeviceLocs.push_back(operand.getLoc()); + } + for (const Fortran::parser::OmpObject &ompObject : useDeviceClause.v) { + Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); + useDeviceSymbols.push_back(sym); + } +} + +//===----------------------------------------------------------------------===// +// ClauseProcessor unique clauses +//===----------------------------------------------------------------------===// + +bool ClauseProcessor::processCollapse( + mlir::Location currentLocation, Fortran::lower::pft::Evaluation &eval, + llvm::SmallVectorImpl &lowerBound, + llvm::SmallVectorImpl &upperBound, + llvm::SmallVectorImpl &step, + llvm::SmallVectorImpl &iv, + std::size_t &loopVarTypeSize) const { + bool found = false; + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + // Collect the loops to collapse. 
+ Fortran::lower::pft::Evaluation *doConstructEval = + &eval.getFirstNestedEvaluation(); + if (doConstructEval->getIf() + ->IsDoConcurrent()) { + TODO(currentLocation, "Do Concurrent in Worksharing loop construct"); + } + + std::int64_t collapseValue = 1l; + if (auto *collapseClause = findUniqueClause()) { + const auto *expr = Fortran::semantics::GetExpr(collapseClause->v); + collapseValue = Fortran::evaluate::ToInt64(*expr).value(); + found = true; + } + + loopVarTypeSize = 0; + do { + Fortran::lower::pft::Evaluation *doLoop = + &doConstructEval->getFirstNestedEvaluation(); + auto *doStmt = doLoop->getIf(); + assert(doStmt && "Expected do loop to be in the nested evaluation"); + const auto &loopControl = + std::get>(doStmt->t); + const Fortran::parser::LoopControl::Bounds *bounds = + std::get_if(&loopControl->u); + assert(bounds && "Expected bounds for worksharing do loop"); + Fortran::lower::StatementContext stmtCtx; + lowerBound.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->lower), stmtCtx))); + upperBound.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->upper), stmtCtx))); + if (bounds->step) { + step.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->step), stmtCtx))); + } else { // If `step` is not present, assume it as `1`. 
+ step.push_back(firOpBuilder.createIntegerConstant( + currentLocation, firOpBuilder.getIntegerType(32), 1)); + } + iv.push_back(bounds->name.thing.symbol); + loopVarTypeSize = std::max(loopVarTypeSize, + bounds->name.thing.symbol->GetUltimate().size()); + collapseValue--; + doConstructEval = + &*std::next(doConstructEval->getNestedEvaluations().begin()); + } while (collapseValue > 0); + + return found; +} + +bool ClauseProcessor::processDefault() const { + if (auto *defaultClause = findUniqueClause()) { + // Private, Firstprivate, Shared, None + switch (defaultClause->v.v) { + case Fortran::parser::OmpDefaultClause::Type::Shared: + case Fortran::parser::OmpDefaultClause::Type::None: + // Default clause with shared or none do not require any handling since + // Shared is the default behavior in the IR and None is only required + // for semantic checks. + break; + case Fortran::parser::OmpDefaultClause::Type::Private: + // TODO Support default(private) + break; + case Fortran::parser::OmpDefaultClause::Type::Firstprivate: + // TODO Support default(firstprivate) + break; + } + return true; + } + return false; +} + +bool ClauseProcessor::processDevice(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const { + const Fortran::parser::CharBlock *source = nullptr; + if (auto *deviceClause = findUniqueClause(&source)) { + mlir::Location clauseLocation = converter.genLocation(*source); + if (auto deviceModifier = std::get< + std::optional>( + deviceClause->v.t)) { + if (deviceModifier == + Fortran::parser::OmpDeviceClause::DeviceModifier::Ancestor) { + TODO(clauseLocation, "OMPD_target Device Modifier Ancestor"); + } + } + if (const auto *deviceExpr = Fortran::semantics::GetExpr( + std::get(deviceClause->v.t))) { + result = fir::getBase(converter.genExprValue(*deviceExpr, stmtCtx)); + } + return true; + } + return false; +} + +bool ClauseProcessor::processDeviceType( + mlir::omp::DeclareTargetDeviceType &result) const { + if (auto *deviceTypeClause = 
findUniqueClause()) { + // Case: declare target ... device_type(any | host | nohost) + switch (deviceTypeClause->v.v) { + case Fortran::parser::OmpDeviceTypeClause::Type::Nohost: + result = mlir::omp::DeclareTargetDeviceType::nohost; + break; + case Fortran::parser::OmpDeviceTypeClause::Type::Host: + result = mlir::omp::DeclareTargetDeviceType::host; + break; + case Fortran::parser::OmpDeviceTypeClause::Type::Any: + result = mlir::omp::DeclareTargetDeviceType::any; + break; + } + return true; + } + return false; +} + +bool ClauseProcessor::processFinal(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const { + const Fortran::parser::CharBlock *source = nullptr; + if (auto *finalClause = findUniqueClause(&source)) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::Location clauseLocation = converter.genLocation(*source); + + mlir::Value finalVal = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(finalClause->v), stmtCtx)); + result = firOpBuilder.createConvert(clauseLocation, + firOpBuilder.getI1Type(), finalVal); + return true; + } + return false; +} + +bool ClauseProcessor::processHint(mlir::IntegerAttr &result) const { + if (auto *hintClause = findUniqueClause()) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + const auto *expr = Fortran::semantics::GetExpr(hintClause->v); + int64_t hintValue = *Fortran::evaluate::ToInt64(*expr); + result = firOpBuilder.getI64IntegerAttr(hintValue); + return true; + } + return false; +} + +bool ClauseProcessor::processMergeable(mlir::UnitAttr &result) const { + return markClauseOccurrence(result); +} + +bool ClauseProcessor::processNowait(mlir::UnitAttr &result) const { + return markClauseOccurrence(result); +} + +bool ClauseProcessor::processNumTeams(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const { + // TODO Get lower and upper bounds for num_teams when parser is updated to + // accept both. 
+ if (auto *numTeamsClause = findUniqueClause()) { + result = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(numTeamsClause->v), stmtCtx)); + return true; + } + return false; +} + +bool ClauseProcessor::processNumThreads( + Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const { + if (auto *numThreadsClause = findUniqueClause()) { + // OMPIRBuilder expects `NUM_THREADS` clause as a `Value`. + result = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(numThreadsClause->v), stmtCtx)); + return true; + } + return false; +} + +bool ClauseProcessor::processOrdered(mlir::IntegerAttr &result) const { + if (auto *orderedClause = findUniqueClause()) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + int64_t orderedClauseValue = 0l; + if (orderedClause->v.has_value()) { + const auto *expr = Fortran::semantics::GetExpr(orderedClause->v); + orderedClauseValue = *Fortran::evaluate::ToInt64(*expr); + } + result = firOpBuilder.getI64IntegerAttr(orderedClauseValue); + return true; + } + return false; +} + +bool ClauseProcessor::processPriority(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const { + if (auto *priorityClause = findUniqueClause()) { + result = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(priorityClause->v), stmtCtx)); + return true; + } + return false; +} + +bool ClauseProcessor::processProcBind( + mlir::omp::ClauseProcBindKindAttr &result) const { + if (auto *procBindClause = findUniqueClause()) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + result = genProcBindKindAttr(firOpBuilder, procBindClause); + return true; + } + return false; +} + +bool ClauseProcessor::processSafelen(mlir::IntegerAttr &result) const { + if (auto *safelenClause = findUniqueClause()) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + const auto *expr = Fortran::semantics::GetExpr(safelenClause->v); + const std::optional safelenVal = + 
Fortran::evaluate::ToInt64(*expr); + result = firOpBuilder.getI64IntegerAttr(*safelenVal); + return true; + } + return false; +} + +bool ClauseProcessor::processSchedule( + mlir::omp::ClauseScheduleKindAttr &valAttr, + mlir::omp::ScheduleModifierAttr &modifierAttr, + mlir::UnitAttr &simdModifierAttr) const { + if (auto *scheduleClause = findUniqueClause()) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::MLIRContext *context = firOpBuilder.getContext(); + const Fortran::parser::OmpScheduleClause &scheduleType = scheduleClause->v; + const auto &scheduleClauseKind = + std::get( + scheduleType.t); + + mlir::omp::ClauseScheduleKind scheduleKind; + switch (scheduleClauseKind) { + case Fortran::parser::OmpScheduleClause::ScheduleType::Static: + scheduleKind = mlir::omp::ClauseScheduleKind::Static; + break; + case Fortran::parser::OmpScheduleClause::ScheduleType::Dynamic: + scheduleKind = mlir::omp::ClauseScheduleKind::Dynamic; + break; + case Fortran::parser::OmpScheduleClause::ScheduleType::Guided: + scheduleKind = mlir::omp::ClauseScheduleKind::Guided; + break; + case Fortran::parser::OmpScheduleClause::ScheduleType::Auto: + scheduleKind = mlir::omp::ClauseScheduleKind::Auto; + break; + case Fortran::parser::OmpScheduleClause::ScheduleType::Runtime: + scheduleKind = mlir::omp::ClauseScheduleKind::Runtime; + break; + } + + mlir::omp::ScheduleModifier scheduleModifier = + getScheduleModifier(scheduleClause->v); + + if (scheduleModifier != mlir::omp::ScheduleModifier::none) + modifierAttr = + mlir::omp::ScheduleModifierAttr::get(context, scheduleModifier); + + if (getSimdModifier(scheduleClause->v) != mlir::omp::ScheduleModifier::none) + simdModifierAttr = firOpBuilder.getUnitAttr(); + + valAttr = mlir::omp::ClauseScheduleKindAttr::get(context, scheduleKind); + return true; + } + return false; +} + +bool ClauseProcessor::processScheduleChunk( + Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const { + if (auto *scheduleClause = 
findUniqueClause()) { + if (const auto &chunkExpr = + std::get>( + scheduleClause->v.t)) { + if (const auto *expr = Fortran::semantics::GetExpr(*chunkExpr)) { + result = fir::getBase(converter.genExprValue(*expr, stmtCtx)); + } + } + return true; + } + return false; +} + +bool ClauseProcessor::processSimdlen(mlir::IntegerAttr &result) const { + if (auto *simdlenClause = findUniqueClause()) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + const auto *expr = Fortran::semantics::GetExpr(simdlenClause->v); + const std::optional simdlenVal = + Fortran::evaluate::ToInt64(*expr); + result = firOpBuilder.getI64IntegerAttr(*simdlenVal); + return true; + } + return false; +} + +bool ClauseProcessor::processThreadLimit( + Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const { + if (auto *threadLmtClause = findUniqueClause()) { + result = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(threadLmtClause->v), stmtCtx)); + return true; + } + return false; +} + +bool ClauseProcessor::processUntied(mlir::UnitAttr &result) const { + return markClauseOccurrence(result); +} + +//===----------------------------------------------------------------------===// +// ClauseProcessor repeatable clauses +//===----------------------------------------------------------------------===// + +bool ClauseProcessor::processAllocate( + llvm::SmallVectorImpl &allocatorOperands, + llvm::SmallVectorImpl &allocateOperands) const { + return findRepeatableClause( + [&](const ClauseTy::Allocate *allocateClause, + const Fortran::parser::CharBlock &) { + genAllocateClause(converter, allocateClause->v, allocatorOperands, + allocateOperands); + }); +} + +bool ClauseProcessor::processCopyin() const { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::OpBuilder::InsertPoint insPt = firOpBuilder.saveInsertionPoint(); + firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); + auto checkAndCopyHostAssociateVar = + 
[&](Fortran::semantics::Symbol *sym, + mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr) { + assert(sym->has() && + "No host-association found"); + if (converter.isPresentShallowLookup(*sym)) + converter.copyHostAssociateVar(*sym, copyAssignIP); + }; + bool hasCopyin = findRepeatableClause( + [&](const ClauseTy::Copyin *copyinClause, + const Fortran::parser::CharBlock &) { + const Fortran::parser::OmpObjectList &ompObjectList = copyinClause->v; + for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) { + Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); + if (const auto *commonDetails = + sym->detailsIf()) { + for (const auto &mem : commonDetails->objects()) + checkAndCopyHostAssociateVar(&*mem, &insPt); + break; + } + if (Fortran::semantics::IsAllocatableOrObjectPointer( + &sym->GetUltimate())) + TODO(converter.getCurrentLocation(), + "pointer or allocatable variables in Copyin clause"); + assert(sym->has() && + "No host-association found"); + checkAndCopyHostAssociateVar(sym); + } + }); + + // [OMP 5.0, 2.19.6.1] The copy is done after the team is formed and prior to + // the execution of the associated structured block. Emit implicit barrier to + // synchronize threads and avoid data races on propagation master's thread + // values of threadprivate variables to local instances of that variables of + // all other implicit threads. 
+ if (hasCopyin) + firOpBuilder.create(converter.getCurrentLocation()); + firOpBuilder.restoreInsertionPoint(insPt); + return hasCopyin; +} + +bool ClauseProcessor::processDepend( + llvm::SmallVectorImpl &dependTypeOperands, + llvm::SmallVectorImpl &dependOperands) const { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + return findRepeatableClause( + [&](const ClauseTy::Depend *dependClause, + const Fortran::parser::CharBlock &) { + const std::list &depVal = + std::get>( + std::get( + dependClause->v.u) + .t); + mlir::omp::ClauseTaskDependAttr dependTypeOperand = + genDependKindAttr(firOpBuilder, dependClause); + dependTypeOperands.insert(dependTypeOperands.end(), depVal.size(), + dependTypeOperand); + for (const Fortran::parser::Designator &ompObject : depVal) { + Fortran::semantics::Symbol *sym = nullptr; + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::DataRef &designator) { + if (const Fortran::parser::Name *name = + std::get_if(&designator.u)) { + sym = name->symbol; + } else if (std::get_if>( + &designator.u)) { + TODO(converter.getCurrentLocation(), + "array sections not supported for task depend"); + } + }, + [&](const Fortran::parser::Substring &designator) { + TODO(converter.getCurrentLocation(), + "substring not supported for task depend"); + }}, + (ompObject).u); + const mlir::Value variable = converter.getSymbolAddress(*sym); + dependOperands.push_back(variable); + } + }); +} + +bool ClauseProcessor::processIf( + Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName, + mlir::Value &result) const { + bool found = false; + findRepeatableClause( + [&](const ClauseTy::If *ifClause, + const Fortran::parser::CharBlock &source) { + mlir::Location clauseLocation = converter.genLocation(source); + mlir::Value operand = getIfClauseOperand(converter, ifClause, + directiveName, clauseLocation); + // Assume that, at most, a single 'if' clause will be applicable to the + // given directive. 
+ if (operand) { + result = operand; + found = true; + } + }); + return found; +} + +bool ClauseProcessor::processLink( + llvm::SmallVectorImpl &result) const { + return findRepeatableClause( + [&](const ClauseTy::Link *linkClause, + const Fortran::parser::CharBlock &) { + // Case: declare target link(var1, var2)... + gatherFuncAndVarSyms( + linkClause->v, mlir::omp::DeclareTargetCaptureClause::link, result); + }); +} + +mlir::omp::MapInfoOp +createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, + mlir::SmallVector bounds, + mlir::SmallVector members, uint64_t mapType, + mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, + bool isVal) { + if (auto boxTy = baseAddr.getType().dyn_cast()) { + baseAddr = builder.create(loc, baseAddr); + retTy = baseAddr.getType(); + } + + mlir::TypeAttr varType = mlir::TypeAttr::get( + llvm::cast(retTy).getElementType()); + + mlir::omp::MapInfoOp op = builder.create( + loc, retTy, baseAddr, varType, varPtrPtr, members, bounds, + builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), + builder.getAttr(mapCaptureType), + builder.getStringAttr(name)); + + return op; +} + +bool ClauseProcessor::processMap( + mlir::Location currentLocation, const llvm::omp::Directive &directive, + Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl &mapOperands, + llvm::SmallVectorImpl *mapSymTypes, + llvm::SmallVectorImpl *mapSymLocs, + llvm::SmallVectorImpl *mapSymbols) + const { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + return findRepeatableClause( + [&](const ClauseTy::Map *mapClause, + const Fortran::parser::CharBlock &source) { + mlir::Location clauseLocation = converter.genLocation(source); + const auto &oMapType = + std::get>( + mapClause->v.t); + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; + // If the map type is specified, then process it else 
Tofrom is the + // default. + if (oMapType) { + const Fortran::parser::OmpMapType::Type &mapType = + std::get(oMapType->t); + switch (mapType) { + case Fortran::parser::OmpMapType::Type::To: + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + break; + case Fortran::parser::OmpMapType::Type::From: + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + break; + case Fortran::parser::OmpMapType::Type::Tofrom: + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + break; + case Fortran::parser::OmpMapType::Type::Alloc: + case Fortran::parser::OmpMapType::Type::Release: + // alloc and release is the default map_type for the Target Data + // Ops, i.e. if no bits for map_type is supplied then alloc/release + // is implicitly assumed based on the target directive. Default + // value for Target Data and Enter Data is alloc and for Exit Data + // it is release. + break; + case Fortran::parser::OmpMapType::Type::Delete: + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; + } + + if (std::get>( + oMapType->t)) + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; + } else { + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + } + + for (const Fortran::parser::OmpObject &ompObject : + std::get(mapClause->v.t).v) { + llvm::SmallVector bounds; + std::stringstream asFortran; + + Fortran::lower::AddrAndBoundsInfo info = + Fortran::lower::gatherDataOperandAddrAndBounds< + Fortran::parser::OmpObject, mlir::omp::DataBoundsOp, + mlir::omp::DataBoundsType>( + converter, firOpBuilder, semaCtx, stmtCtx, ompObject, + clauseLocation, asFortran, bounds, treatIndexAsSection); + + auto origSymbol = + converter.getSymbolAddress(*getOmpObjectSymbol(ompObject)); + mlir::Value symAddr = info.addr; + if (origSymbol && fir::isTypeWithDescriptor(origSymbol.getType())) + symAddr = origSymbol; + + 
// Explicit map captures are captured ByRef by default, + // optimisation passes may alter this to ByCopy or other capture + // types to optimise + mlir::Value mapOp = createMapInfoOp( + firOpBuilder, clauseLocation, symAddr, mlir::Value{}, + asFortran.str(), bounds, {}, + static_cast< + std::underlying_type_t>( + mapTypeBits), + mlir::omp::VariableCaptureKind::ByRef, symAddr.getType()); + + mapOperands.push_back(mapOp); + if (mapSymTypes) + mapSymTypes->push_back(symAddr.getType()); + if (mapSymLocs) + mapSymLocs->push_back(symAddr.getLoc()); + + if (mapSymbols) + mapSymbols->push_back(getOmpObjectSymbol(ompObject)); + } + }); +} + +bool ClauseProcessor::processReduction( + mlir::Location currentLocation, + llvm::SmallVectorImpl &reductionVars, + llvm::SmallVectorImpl &reductionDeclSymbols, + llvm::SmallVectorImpl *reductionSymbols) + const { + return findRepeatableClause( + [&](const ClauseTy::Reduction *reductionClause, + const Fortran::parser::CharBlock &) { + ReductionProcessor rp; + rp.addReductionDecl(currentLocation, converter, reductionClause->v, + reductionVars, reductionDeclSymbols, + reductionSymbols); + }); +} + +bool ClauseProcessor::processSectionsReduction( + mlir::Location currentLocation) const { + return findRepeatableClause( + [&](const ClauseTy::Reduction *, const Fortran::parser::CharBlock &) { + TODO(currentLocation, "OMPC_Reduction"); + }); +} + +bool ClauseProcessor::processTo( + llvm::SmallVectorImpl &result) const { + return findRepeatableClause( + [&](const ClauseTy::To *toClause, const Fortran::parser::CharBlock &) { + // Case: declare target to(func, var1, var2)... + gatherFuncAndVarSyms(toClause->v, + mlir::omp::DeclareTargetCaptureClause::to, result); + }); +} + +bool ClauseProcessor::processEnter( + llvm::SmallVectorImpl &result) const { + return findRepeatableClause( + [&](const ClauseTy::Enter *enterClause, + const Fortran::parser::CharBlock &) { + // Case: declare target enter(func, var1, var2)... 
+ gatherFuncAndVarSyms(enterClause->v, + mlir::omp::DeclareTargetCaptureClause::enter, + result); + }); +} + +bool ClauseProcessor::processUseDeviceAddr( + llvm::SmallVectorImpl &operands, + llvm::SmallVectorImpl &useDeviceTypes, + llvm::SmallVectorImpl &useDeviceLocs, + llvm::SmallVectorImpl &useDeviceSymbols) + const { + return findRepeatableClause( + [&](const ClauseTy::UseDeviceAddr *devAddrClause, + const Fortran::parser::CharBlock &) { + addUseDeviceClause(converter, devAddrClause->v, operands, + useDeviceTypes, useDeviceLocs, useDeviceSymbols); + }); +} + +bool ClauseProcessor::processUseDevicePtr( + llvm::SmallVectorImpl &operands, + llvm::SmallVectorImpl &useDeviceTypes, + llvm::SmallVectorImpl &useDeviceLocs, + llvm::SmallVectorImpl &useDeviceSymbols) + const { + return findRepeatableClause( + [&](const ClauseTy::UseDevicePtr *devPtrClause, + const Fortran::parser::CharBlock &) { + addUseDeviceClause(converter, devPtrClause->v, operands, useDeviceTypes, + useDeviceLocs, useDeviceSymbols); + }); +} +} // namespace omp +} // namespace lower +} // namespace Fortran diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h new file mode 100644 index 0000000000000..312255112605e --- /dev/null +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -0,0 +1,305 @@ +//===-- Lower/OpenMP/ClauseProcessor.h --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// +#ifndef FORTRAN_LOWER_CLAUASEPROCESSOR_H +#define FORTRAN_LOWER_CLAUASEPROCESSOR_H + +#include "DirectivesCommon.h" +#include "ReductionProcessor.h" +#include "Utils.h" +#include "flang/Lower/AbstractConverter.h" +#include "flang/Lower/Bridge.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Parser/dump-parse-tree.h" +#include "flang/Parser/parse-tree.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" + +namespace fir { +class FirOpBuilder; +} // namespace fir + +namespace Fortran { +namespace lower { +namespace omp { + +/// Class that handles the processing of OpenMP clauses. +/// +/// Its `process()` methods perform MLIR code generation for their +/// corresponding clause if it is present in the clause list. Otherwise, they +/// will return `false` to signal that the clause was not found. +/// +/// The intended use is of this class is to move clause processing outside of +/// construct processing, since the same clauses can appear attached to +/// different constructs and constructs can be combined, so that code +/// duplication is minimized. +/// +/// Each construct-lowering function only calls the `process()` +/// methods that relate to clauses that can impact the lowering of that +/// construct. +class ClauseProcessor { + using ClauseTy = Fortran::parser::OmpClause; + +public: + ClauseProcessor(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses) + : converter(converter), semaCtx(semaCtx), clauses(clauses) {} + + // 'Unique' clauses: They can appear at most once in the clause list. 
+ bool + processCollapse(mlir::Location currentLocation, + Fortran::lower::pft::Evaluation &eval, + llvm::SmallVectorImpl &lowerBound, + llvm::SmallVectorImpl &upperBound, + llvm::SmallVectorImpl &step, + llvm::SmallVectorImpl &iv, + std::size_t &loopVarTypeSize) const; + bool processDefault() const; + bool processDevice(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const; + bool processDeviceType(mlir::omp::DeclareTargetDeviceType &result) const; + bool processFinal(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const; + bool processHint(mlir::IntegerAttr &result) const; + bool processMergeable(mlir::UnitAttr &result) const; + bool processNowait(mlir::UnitAttr &result) const; + bool processNumTeams(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const; + bool processNumThreads(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const; + bool processOrdered(mlir::IntegerAttr &result) const; + bool processPriority(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const; + bool processProcBind(mlir::omp::ClauseProcBindKindAttr &result) const; + bool processSafelen(mlir::IntegerAttr &result) const; + bool processSchedule(mlir::omp::ClauseScheduleKindAttr &valAttr, + mlir::omp::ScheduleModifierAttr &modifierAttr, + mlir::UnitAttr &simdModifierAttr) const; + bool processScheduleChunk(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const; + bool processSimdlen(mlir::IntegerAttr &result) const; + bool processThreadLimit(Fortran::lower::StatementContext &stmtCtx, + mlir::Value &result) const; + bool processUntied(mlir::UnitAttr &result) const; + + // 'Repeatable' clauses: They can appear multiple times in the clause list. 
+ bool + processAllocate(llvm::SmallVectorImpl &allocatorOperands, + llvm::SmallVectorImpl &allocateOperands) const; + bool processCopyin() const; + bool processDepend(llvm::SmallVectorImpl &dependTypeOperands, + llvm::SmallVectorImpl &dependOperands) const; + bool + processEnter(llvm::SmallVectorImpl &result) const; + bool + processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName, + mlir::Value &result) const; + bool + processLink(llvm::SmallVectorImpl &result) const; + + // This method is used to process a map clause. + // The optional parameters - mapSymTypes, mapSymLocs & mapSymbols are used to + // store the original type, location and Fortran symbol for the map operands. + // They may be used later on to create the block_arguments for some of the + // target directives that require it. + bool processMap(mlir::Location currentLocation, + const llvm::omp::Directive &directive, + Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl &mapOperands, + llvm::SmallVectorImpl *mapSymTypes = nullptr, + llvm::SmallVectorImpl *mapSymLocs = nullptr, + llvm::SmallVectorImpl + *mapSymbols = nullptr) const; + bool + processReduction(mlir::Location currentLocation, + llvm::SmallVectorImpl &reductionVars, + llvm::SmallVectorImpl &reductionDeclSymbols, + llvm::SmallVectorImpl + *reductionSymbols = nullptr) const; + bool processSectionsReduction(mlir::Location currentLocation) const; + bool processTo(llvm::SmallVectorImpl &result) const; + bool + processUseDeviceAddr(llvm::SmallVectorImpl &operands, + llvm::SmallVectorImpl &useDeviceTypes, + llvm::SmallVectorImpl &useDeviceLocs, + llvm::SmallVectorImpl + &useDeviceSymbols) const; + bool + processUseDevicePtr(llvm::SmallVectorImpl &operands, + llvm::SmallVectorImpl &useDeviceTypes, + llvm::SmallVectorImpl &useDeviceLocs, + llvm::SmallVectorImpl + &useDeviceSymbols) const; + + template + bool processMotionClauses(Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl &mapOperands); + + 
// Call this method for these clauses that should be supported but are not + // implemented yet. It triggers a compilation error if any of the given + // clauses is found. + template + void processTODO(mlir::Location currentLocation, + llvm::omp::Directive directive) const; + +private: + using ClauseIterator = std::list::const_iterator; + + /// Utility to find a clause within a range in the clause list. + template + static ClauseIterator findClause(ClauseIterator begin, ClauseIterator end); + + /// Return the first instance of the given clause found in the clause list or + /// `nullptr` if not present. If more than one instance is expected, use + /// `findRepeatableClause` instead. + template + const T * + findUniqueClause(const Fortran::parser::CharBlock **source = nullptr) const; + + /// Call `callbackFn` for each occurrence of the given clause. Return `true` + /// if at least one instance was found. + template + bool findRepeatableClause( + std::function + callbackFn) const; + + /// Set the `result` to a new `mlir::UnitAttr` if the clause is present. + template + bool markClauseOccurrence(mlir::UnitAttr &result) const; + + Fortran::lower::AbstractConverter &converter; + Fortran::semantics::SemanticsContext &semaCtx; + const Fortran::parser::OmpClauseList &clauses; +}; + +template +bool ClauseProcessor::processMotionClauses( + Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl &mapOperands) { + return findRepeatableClause( + [&](const T *motionClause, const Fortran::parser::CharBlock &source) { + mlir::Location clauseLocation = converter.genLocation(source); + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + static_assert(std::is_same_v || + std::is_same_v); + + // TODO Support motion modifiers: present, mapper, iterator. + constexpr llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + std::is_same_v + ? 
llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO + : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + + for (const Fortran::parser::OmpObject &ompObject : motionClause->v.v) { + llvm::SmallVector bounds; + std::stringstream asFortran; + Fortran::lower::AddrAndBoundsInfo info = + Fortran::lower::gatherDataOperandAddrAndBounds< + Fortran::parser::OmpObject, mlir::omp::DataBoundsOp, + mlir::omp::DataBoundsType>( + converter, firOpBuilder, semaCtx, stmtCtx, ompObject, + clauseLocation, asFortran, bounds, treatIndexAsSection); + + auto origSymbol = + converter.getSymbolAddress(*getOmpObjectSymbol(ompObject)); + mlir::Value symAddr = info.addr; + if (origSymbol && fir::isTypeWithDescriptor(origSymbol.getType())) + symAddr = origSymbol; + + // Explicit map captures are captured ByRef by default, + // optimisation passes may alter this to ByCopy or other capture + // types to optimise + mlir::Value mapOp = createMapInfoOp( + firOpBuilder, clauseLocation, symAddr, mlir::Value{}, + asFortran.str(), bounds, {}, + static_cast< + std::underlying_type_t>( + mapTypeBits), + mlir::omp::VariableCaptureKind::ByRef, symAddr.getType()); + + mapOperands.push_back(mapOp); + } + }); +} + +template +void ClauseProcessor::processTODO(mlir::Location currentLocation, + llvm::omp::Directive directive) const { + auto checkUnhandledClause = [&](const auto *x) { + if (!x) + return; + TODO(currentLocation, + "Unhandled clause " + + llvm::StringRef(Fortran::parser::ParseTreeDumper::GetNodeName(*x)) + .upper() + + " in " + llvm::omp::getOpenMPDirectiveName(directive).upper() + + " construct"); + }; + + for (ClauseIterator it = clauses.v.begin(); it != clauses.v.end(); ++it) + (checkUnhandledClause(std::get_if(&it->u)), ...); +} + +template +ClauseProcessor::ClauseIterator +ClauseProcessor::findClause(ClauseIterator begin, ClauseIterator end) { + for (ClauseIterator it = begin; it != end; ++it) { + if (std::get_if(&it->u)) + return it; + } + + return end; +} + +template +const T 
*ClauseProcessor::findUniqueClause( + const Fortran::parser::CharBlock **source) const { + ClauseIterator it = findClause(clauses.v.begin(), clauses.v.end()); + if (it != clauses.v.end()) { + if (source) + *source = &it->source; + return &std::get(it->u); + } + return nullptr; +} + +template +bool ClauseProcessor::findRepeatableClause( + std::function + callbackFn) const { + bool found = false; + ClauseIterator nextIt, endIt = clauses.v.end(); + for (ClauseIterator it = clauses.v.begin(); it != endIt; it = nextIt) { + nextIt = findClause(it, endIt); + + if (nextIt != endIt) { + callbackFn(&std::get(nextIt->u), nextIt->source); + found = true; + ++nextIt; + } + } + return found; +} + +template +bool ClauseProcessor::markClauseOccurrence(mlir::UnitAttr &result) const { + if (findUniqueClause()) { + result = converter.getFirOpBuilder().getUnitAttr(); + return true; + } + return false; +} + +} // namespace omp +} // namespace lower +} // namespace Fortran + +#endif // FORTRAN_LOWER_CLAUASEPROCESSOR_H diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp new file mode 100644 index 0000000000000..136bda0b582ee --- /dev/null +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -0,0 +1,350 @@ +//===-- DataSharingProcessor.cpp --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#include "DataSharingProcessor.h" + +#include "Utils.h" +#include "flang/Lower/PFTBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Semantics/tools.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" + +namespace Fortran { +namespace lower { +namespace omp { + +void DataSharingProcessor::processStep1() { + collectSymbolsForPrivatization(); + collectDefaultSymbols(); + privatize(); + defaultPrivatize(); + insertBarrier(); +} + +void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) { + insPt = firOpBuilder.saveInsertionPoint(); + copyLastPrivatize(op); + firOpBuilder.restoreInsertionPoint(insPt); + + if (isLoop) { + // push deallocs out of the loop + firOpBuilder.setInsertionPointAfter(op); + insertDeallocs(); + } else { + // insert dummy instruction to mark the insertion position + mlir::Value undefMarker = firOpBuilder.create( + op->getLoc(), firOpBuilder.getIndexType()); + insertDeallocs(); + firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); + } +} + +void DataSharingProcessor::insertDeallocs() { + for (const Fortran::semantics::Symbol *sym : privatizedSymbols) + if (Fortran::semantics::IsAllocatable(sym->GetUltimate())) { + converter.createHostAssociateVarCloneDealloc(*sym); + } +} + +void DataSharingProcessor::cloneSymbol(const Fortran::semantics::Symbol *sym) { + // Privatization for symbols which are pre-determined (like loop index + // variables) happen separately, for everything else privatize here. 
+ if (sym->test(Fortran::semantics::Symbol::Flag::OmpPreDetermined)) + return; + bool success = converter.createHostAssociateVarClone(*sym); + (void)success; + assert(success && "Privatization failed due to existing binding"); +} + +void DataSharingProcessor::copyFirstPrivateSymbol( + const Fortran::semantics::Symbol *sym) { + if (sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate)) + converter.copyHostAssociateVar(*sym); +} + +void DataSharingProcessor::copyLastPrivateSymbol( + const Fortran::semantics::Symbol *sym, + [[maybe_unused]] mlir::OpBuilder::InsertPoint *lastPrivIP) { + if (sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) + converter.copyHostAssociateVar(*sym, lastPrivIP); +} + +void DataSharingProcessor::collectOmpObjectListSymbol( + const Fortran::parser::OmpObjectList &ompObjectList, + llvm::SetVector &symbolSet) { + for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) { + Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); + symbolSet.insert(sym); + } +} + +void DataSharingProcessor::collectSymbolsForPrivatization() { + bool hasCollapse = false; + for (const Fortran::parser::OmpClause &clause : opClauseList.v) { + if (const auto &privateClause = + std::get_if(&clause.u)) { + collectOmpObjectListSymbol(privateClause->v, privatizedSymbols); + } else if (const auto &firstPrivateClause = + std::get_if( + &clause.u)) { + collectOmpObjectListSymbol(firstPrivateClause->v, privatizedSymbols); + } else if (const auto &lastPrivateClause = + std::get_if( + &clause.u)) { + collectOmpObjectListSymbol(lastPrivateClause->v, privatizedSymbols); + hasLastPrivateOp = true; + } else if (std::get_if(&clause.u)) { + hasCollapse = true; + } + } + + if (hasCollapse && hasLastPrivateOp) + TODO(converter.getCurrentLocation(), "Collapse clause with lastprivate"); +} + +bool DataSharingProcessor::needBarrier() { + for (const Fortran::semantics::Symbol *sym : privatizedSymbols) { + if 
(sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate) && + sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) + return true; + } + return false; +} + +void DataSharingProcessor::insertBarrier() { + // Emit implicit barrier to synchronize threads and avoid data races on + // initialization of firstprivate variables and post-update of lastprivate + // variables. + // FIXME: Emit barrier for lastprivate clause when 'sections' directive has + // 'nowait' clause. Otherwise, emit barrier when 'sections' directive has + // both firstprivate and lastprivate clause. + // Emit implicit barrier for linear clause. Maybe on somewhere else. + if (needBarrier()) + firOpBuilder.create(converter.getCurrentLocation()); +} + +void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { + bool cmpCreated = false; + mlir::OpBuilder::InsertPoint localInsPt = firOpBuilder.saveInsertionPoint(); + for (const Fortran::parser::OmpClause &clause : opClauseList.v) { + if (std::get_if(&clause.u)) { + // TODO: Add lastprivate support for simd construct + if (mlir::isa(op)) { + if (&eval == &eval.parentConstruct->getLastNestedEvaluation()) { + // For `omp.sections`, lastprivatized variables occur in + // lexically final `omp.section` operation. The following FIR + // shall be generated for the same: + // + // omp.sections lastprivate(...) { + // omp.section {...} + // omp.section {...} + // omp.section { + // fir.allocate for `private`/`firstprivate` + // + // fir.if %true { + // ^%lpv_update_blk + // } + // } + // } + // + // To keep code consistency while handling privatization + // through this control flow, add a `fir.if` operation + // that always evaluates to true, in order to create + // a dedicated sub-region in `omp.section` where + // lastprivate FIR can reside. Later canonicalizations + // will optimize away this operation. 
+ if (!eval.lowerAsUnstructured()) { + auto ifOp = firOpBuilder.create( + op->getLoc(), + firOpBuilder.createIntegerConstant( + op->getLoc(), firOpBuilder.getIntegerType(1), 0x1), + /*else*/ false); + firOpBuilder.setInsertionPointToStart( + &ifOp.getThenRegion().front()); + + const Fortran::parser::OpenMPConstruct *parentOmpConstruct = + eval.parentConstruct->getIf(); + assert(parentOmpConstruct && + "Expected a valid enclosing OpenMP construct"); + const Fortran::parser::OpenMPSectionsConstruct *sectionsConstruct = + std::get_if( + &parentOmpConstruct->u); + assert(sectionsConstruct && + "Expected an enclosing omp.sections construct"); + const Fortran::parser::OmpClauseList §ionsEndClauseList = + std::get( + std::get( + sectionsConstruct->t) + .t); + for (const Fortran::parser::OmpClause &otherClause : + sectionsEndClauseList.v) + if (std::get_if( + &otherClause.u)) + // Emit implicit barrier to synchronize threads and avoid data + // races on post-update of lastprivate variables when `nowait` + // clause is present. + firOpBuilder.create( + converter.getCurrentLocation()); + firOpBuilder.setInsertionPointToStart( + &ifOp.getThenRegion().front()); + lastPrivIP = firOpBuilder.saveInsertionPoint(); + firOpBuilder.setInsertionPoint(ifOp); + insPt = firOpBuilder.saveInsertionPoint(); + } else { + // Lastprivate operation is inserted at the end + // of the lexically last section in the sections + // construct + mlir::OpBuilder::InsertPoint unstructuredSectionsIP = + firOpBuilder.saveInsertionPoint(); + mlir::Operation *lastOper = op->getRegion(0).back().getTerminator(); + firOpBuilder.setInsertionPoint(lastOper); + lastPrivIP = firOpBuilder.saveInsertionPoint(); + firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP); + } + } + } else if (mlir::isa(op)) { + // Update the original variable just before exiting the worksharing + // loop. Conversion as follows: + // + // omp.wsloop { + // omp.wsloop { ... + // ... 
store + // store ===> %v = arith.addi %iv, %step + // omp.yield %cmp = %step < 0 ? %v < %ub : %v > %ub + // } fir.if %cmp { + // fir.store %v to %loopIV + // ^%lpv_update_blk: + // } + // omp.yield + // } + // + + // Only generate the compare once in presence of multiple LastPrivate + // clauses. + if (cmpCreated) + continue; + cmpCreated = true; + + mlir::Location loc = op->getLoc(); + mlir::Operation *lastOper = op->getRegion(0).back().getTerminator(); + firOpBuilder.setInsertionPoint(lastOper); + + mlir::Value iv = op->getRegion(0).front().getArguments()[0]; + mlir::Value ub = + mlir::dyn_cast(op).getUpperBound()[0]; + mlir::Value step = mlir::dyn_cast(op).getStep()[0]; + + // v = iv + step + // cmp = step < 0 ? v < ub : v > ub + mlir::Value v = firOpBuilder.create(loc, iv, step); + mlir::Value zero = + firOpBuilder.createIntegerConstant(loc, step.getType(), 0); + mlir::Value negativeStep = firOpBuilder.create( + loc, mlir::arith::CmpIPredicate::slt, step, zero); + mlir::Value vLT = firOpBuilder.create( + loc, mlir::arith::CmpIPredicate::slt, v, ub); + mlir::Value vGT = firOpBuilder.create( + loc, mlir::arith::CmpIPredicate::sgt, v, ub); + mlir::Value cmpOp = firOpBuilder.create( + loc, negativeStep, vLT, vGT); + + auto ifOp = firOpBuilder.create(loc, cmpOp, /*else*/ false); + firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + assert(loopIV && "loopIV was not set"); + firOpBuilder.create(op->getLoc(), v, loopIV); + lastPrivIP = firOpBuilder.saveInsertionPoint(); + } else { + TODO(converter.getCurrentLocation(), + "lastprivate clause in constructs other than " + "simd/worksharing-loop"); + } + } + } + firOpBuilder.restoreInsertionPoint(localInsPt); +} + +void DataSharingProcessor::collectSymbols( + Fortran::semantics::Symbol::Flag flag) { + converter.collectSymbolSet(eval, defaultSymbols, flag, + /*collectSymbols=*/true, + /*collectHostAssociatedSymbols=*/true); + for (Fortran::lower::pft::Evaluation &e : eval.getNestedEvaluations()) { + if 
(e.hasNestedEvaluations()) + converter.collectSymbolSet(e, symbolsInNestedRegions, flag, + /*collectSymbols=*/true, + /*collectHostAssociatedSymbols=*/false); + else + converter.collectSymbolSet(e, symbolsInParentRegions, flag, + /*collectSymbols=*/false, + /*collectHostAssociatedSymbols=*/true); + } +} + +void DataSharingProcessor::collectDefaultSymbols() { + for (const Fortran::parser::OmpClause &clause : opClauseList.v) { + if (const auto &defaultClause = + std::get_if(&clause.u)) { + if (defaultClause->v.v == + Fortran::parser::OmpDefaultClause::Type::Private) + collectSymbols(Fortran::semantics::Symbol::Flag::OmpPrivate); + else if (defaultClause->v.v == + Fortran::parser::OmpDefaultClause::Type::Firstprivate) + collectSymbols(Fortran::semantics::Symbol::Flag::OmpFirstPrivate); + } + } +} + +void DataSharingProcessor::privatize() { + for (const Fortran::semantics::Symbol *sym : privatizedSymbols) { + if (const auto *commonDet = + sym->detailsIf()) { + for (const auto &mem : commonDet->objects()) { + cloneSymbol(&*mem); + copyFirstPrivateSymbol(&*mem); + } + } else { + cloneSymbol(sym); + copyFirstPrivateSymbol(sym); + } + } +} + +void DataSharingProcessor::copyLastPrivatize(mlir::Operation *op) { + insertLastPrivateCompare(op); + for (const Fortran::semantics::Symbol *sym : privatizedSymbols) + if (const auto *commonDet = + sym->detailsIf()) { + for (const auto &mem : commonDet->objects()) { + copyLastPrivateSymbol(&*mem, &lastPrivIP); + } + } else { + copyLastPrivateSymbol(sym, &lastPrivIP); + } +} + +void DataSharingProcessor::defaultPrivatize() { + for (const Fortran::semantics::Symbol *sym : defaultSymbols) { + if (!Fortran::semantics::IsProcedure(*sym) && + !sym->GetUltimate().has() && + !sym->GetUltimate().has() && + !symbolsInNestedRegions.contains(sym) && + !symbolsInParentRegions.contains(sym) && + !privatizedSymbols.contains(sym)) { + cloneSymbol(sym); + copyFirstPrivateSymbol(sym); + } + } +} + +} // namespace omp +} // namespace lower +} // 
namespace Fortran diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h new file mode 100644 index 0000000000000..10c0a30c09c39 --- /dev/null +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -0,0 +1,89 @@ +//===-- Lower/OpenMP/DataSharingProcessor.h ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// +#ifndef FORTRAN_LOWER_DATASHARINGPROCESSOR_H +#define FORTRAN_LOWER_DATASHARINGPROCESSOR_H + +#include "flang/Lower/AbstractConverter.h" +#include "flang/Lower/OpenMP.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Parser/parse-tree.h" +#include "flang/Semantics/symbol.h" + +namespace Fortran { +namespace lower { +namespace omp { + +class DataSharingProcessor { + bool hasLastPrivateOp; + mlir::OpBuilder::InsertPoint lastPrivIP; + mlir::OpBuilder::InsertPoint insPt; + mlir::Value loopIV; + // Symbols in private, firstprivate, and/or lastprivate clauses. 
+ llvm::SetVector privatizedSymbols; + llvm::SetVector defaultSymbols; + llvm::SetVector symbolsInNestedRegions; + llvm::SetVector symbolsInParentRegions; + Fortran::lower::AbstractConverter &converter; + fir::FirOpBuilder &firOpBuilder; + const Fortran::parser::OmpClauseList &opClauseList; + Fortran::lower::pft::Evaluation &eval; + + bool needBarrier(); + void collectSymbols(Fortran::semantics::Symbol::Flag flag); + void collectOmpObjectListSymbol( + const Fortran::parser::OmpObjectList &ompObjectList, + llvm::SetVector &symbolSet); + void collectSymbolsForPrivatization(); + void insertBarrier(); + void collectDefaultSymbols(); + void privatize(); + void defaultPrivatize(); + void copyLastPrivatize(mlir::Operation *op); + void insertLastPrivateCompare(mlir::Operation *op); + void cloneSymbol(const Fortran::semantics::Symbol *sym); + void copyFirstPrivateSymbol(const Fortran::semantics::Symbol *sym); + void copyLastPrivateSymbol(const Fortran::semantics::Symbol *sym, + mlir::OpBuilder::InsertPoint *lastPrivIP); + void insertDeallocs(); + +public: + DataSharingProcessor(Fortran::lower::AbstractConverter &converter, + const Fortran::parser::OmpClauseList &opClauseList, + Fortran::lower::pft::Evaluation &eval) + : hasLastPrivateOp(false), converter(converter), + firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList), + eval(eval) {} + // Privatisation is split into two steps. + // Step1 performs cloning of all privatisation clauses and copying for + // firstprivates. Step1 is performed at the place where process/processStep1 + // is called. This is usually inside the Operation corresponding to the OpenMP + // construct, for looping constructs this is just before the Operation. The + // split into two steps was performed basically to be able to call + // privatisation for looping constructs before the operation is created since + // the bounds of the MLIR OpenMP operation can be privatised. 
+ // Step2 performs the copying for lastprivates and requires knowledge of the + // MLIR operation to insert the last private update. Step2 adds + // dealocation code as well. + void processStep1(); + void processStep2(mlir::Operation *op, bool isLoop); + + void setLoopIV(mlir::Value iv) { + assert(!loopIV && "Loop iteration variable already set"); + loopIV = iv; + } +}; + +} // namespace omp +} // namespace lower +} // namespace Fortran + +#endif // FORTRAN_LOWER_DATASHARINGPROCESSOR_H diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp similarity index 55% rename from flang/lib/Lower/OpenMP.cpp rename to flang/lib/Lower/OpenMP/OpenMP.cpp index 9397af8b8bd05..3aefad6cf0ec1 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -11,109 +11,36 @@ //===----------------------------------------------------------------------===// #include "flang/Lower/OpenMP.h" + +#include "ClauseProcessor.h" +#include "DataSharingProcessor.h" #include "DirectivesCommon.h" +#include "ReductionProcessor.h" #include "flang/Common/idioms.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/ConvertExpr.h" #include "flang/Lower/ConvertVariable.h" -#include "flang/Lower/PFTBuilder.h" #include "flang/Lower/StatementContext.h" #include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" -#include "flang/Parser/dump-parse-tree.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/openmp-directive-sets.h" #include "flang/Semantics/tools.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/Support/CommandLine.h" - -static llvm::cl::opt treatIndexAsSection( 
- "openmp-treat-index-as-section", - llvm::cl::desc("In the OpenMP data clauses treat `a(N)` as `a(N:N)`."), - llvm::cl::init(true)); -using DeclareTargetCapturePair = - std::pair; +using namespace Fortran::lower::omp; //===----------------------------------------------------------------------===// -// Common helper functions +// Code generation helper functions //===----------------------------------------------------------------------===// -static Fortran::semantics::Symbol * -getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject) { - Fortran::semantics::Symbol *sym = nullptr; - std::visit( - Fortran::common::visitors{ - [&](const Fortran::parser::Designator &designator) { - if (auto *arrayEle = - Fortran::parser::Unwrap( - designator)) { - sym = GetFirstName(arrayEle->base).symbol; - } else if (auto *structComp = Fortran::parser::Unwrap< - Fortran::parser::StructureComponent>(designator)) { - sym = structComp->component.symbol; - } else if (const Fortran::parser::Name *name = - Fortran::semantics::getDesignatorNameIfDataRef( - designator)) { - sym = name->symbol; - } - }, - [&](const Fortran::parser::Name &name) { sym = name.symbol; }}, - ompObject.u); - return sym; -} - -static void genObjectList(const Fortran::parser::OmpObjectList &objectList, - Fortran::lower::AbstractConverter &converter, - llvm::SmallVectorImpl &operands) { - auto addOperands = [&](Fortran::lower::SymbolRef sym) { - const mlir::Value variable = converter.getSymbolAddress(sym); - if (variable) { - operands.push_back(variable); - } else { - if (const auto *details = - sym->detailsIf()) { - operands.push_back(converter.getSymbolAddress(details->symbol())); - converter.copySymbolBinding(details->symbol(), sym); - } - } - }; - for (const Fortran::parser::OmpObject &ompObject : objectList.v) { - Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); - addOperands(*sym); - } -} - -static void gatherFuncAndVarSyms( - const Fortran::parser::OmpObjectList &objList, - 
mlir::omp::DeclareTargetCaptureClause clause, - llvm::SmallVectorImpl &symbolAndClause) { - for (const Fortran::parser::OmpObject &ompObject : objList.v) { - Fortran::common::visit( - Fortran::common::visitors{ - [&](const Fortran::parser::Designator &designator) { - if (const Fortran::parser::Name *name = - Fortran::semantics::getDesignatorNameIfDataRef( - designator)) { - symbolAndClause.emplace_back(clause, *name->symbol); - } - }, - [&](const Fortran::parser::Name &name) { - symbolAndClause.emplace_back(clause, *name.symbol); - }}, - ompObject.u); - } -} - static Fortran::lower::pft::Evaluation * getCollapsedLoopEval(Fortran::lower::pft::Evaluation &eval, int collapseValue) { // Return the Evaluation of the innermost collapsed loop, or the current one @@ -142,1961 +69,6 @@ static void genNestedEvaluations(Fortran::lower::AbstractConverter &converter, converter.genEval(e); } -//===----------------------------------------------------------------------===// -// DataSharingProcessor -//===----------------------------------------------------------------------===// - -class DataSharingProcessor { - bool hasLastPrivateOp; - mlir::OpBuilder::InsertPoint lastPrivIP; - mlir::OpBuilder::InsertPoint insPt; - mlir::Value loopIV; - // Symbols in private, firstprivate, and/or lastprivate clauses. 
- llvm::SetVector privatizedSymbols; - llvm::SetVector defaultSymbols; - llvm::SetVector symbolsInNestedRegions; - llvm::SetVector symbolsInParentRegions; - Fortran::lower::AbstractConverter &converter; - fir::FirOpBuilder &firOpBuilder; - const Fortran::parser::OmpClauseList &opClauseList; - Fortran::lower::pft::Evaluation &eval; - - bool needBarrier(); - void collectSymbols(Fortran::semantics::Symbol::Flag flag); - void collectOmpObjectListSymbol( - const Fortran::parser::OmpObjectList &ompObjectList, - llvm::SetVector &symbolSet); - void collectSymbolsForPrivatization(); - void insertBarrier(); - void collectDefaultSymbols(); - void privatize(); - void defaultPrivatize(); - void copyLastPrivatize(mlir::Operation *op); - void insertLastPrivateCompare(mlir::Operation *op); - void cloneSymbol(const Fortran::semantics::Symbol *sym); - void copyFirstPrivateSymbol(const Fortran::semantics::Symbol *sym); - void copyLastPrivateSymbol(const Fortran::semantics::Symbol *sym, - mlir::OpBuilder::InsertPoint *lastPrivIP); - void insertDeallocs(); - -public: - DataSharingProcessor(Fortran::lower::AbstractConverter &converter, - const Fortran::parser::OmpClauseList &opClauseList, - Fortran::lower::pft::Evaluation &eval) - : hasLastPrivateOp(false), converter(converter), - firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList), - eval(eval) {} - // Privatisation is split into two steps. - // Step1 performs cloning of all privatisation clauses and copying for - // firstprivates. Step1 is performed at the place where process/processStep1 - // is called. This is usually inside the Operation corresponding to the OpenMP - // construct, for looping constructs this is just before the Operation. The - // split into two steps was performed basically to be able to call - // privatisation for looping constructs before the operation is created since - // the bounds of the MLIR OpenMP operation can be privatised. 
- // Step2 performs the copying for lastprivates and requires knowledge of the - // MLIR operation to insert the last private update. Step2 adds - // dealocation code as well. - void processStep1(); - void processStep2(mlir::Operation *op, bool isLoop); - - void setLoopIV(mlir::Value iv) { - assert(!loopIV && "Loop iteration variable already set"); - loopIV = iv; - } -}; - -void DataSharingProcessor::processStep1() { - collectSymbolsForPrivatization(); - collectDefaultSymbols(); - privatize(); - defaultPrivatize(); - insertBarrier(); -} - -void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) { - insPt = firOpBuilder.saveInsertionPoint(); - copyLastPrivatize(op); - firOpBuilder.restoreInsertionPoint(insPt); - - if (isLoop) { - // push deallocs out of the loop - firOpBuilder.setInsertionPointAfter(op); - insertDeallocs(); - } else { - // insert dummy instruction to mark the insertion position - mlir::Value undefMarker = firOpBuilder.create( - op->getLoc(), firOpBuilder.getIndexType()); - insertDeallocs(); - firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); - } -} - -void DataSharingProcessor::insertDeallocs() { - for (const Fortran::semantics::Symbol *sym : privatizedSymbols) - if (Fortran::semantics::IsAllocatable(sym->GetUltimate())) { - converter.createHostAssociateVarCloneDealloc(*sym); - } -} - -void DataSharingProcessor::cloneSymbol(const Fortran::semantics::Symbol *sym) { - // Privatization for symbols which are pre-determined (like loop index - // variables) happen separately, for everything else privatize here. 
- if (sym->test(Fortran::semantics::Symbol::Flag::OmpPreDetermined)) - return; - bool success = converter.createHostAssociateVarClone(*sym); - (void)success; - assert(success && "Privatization failed due to existing binding"); -} - -void DataSharingProcessor::copyFirstPrivateSymbol( - const Fortran::semantics::Symbol *sym) { - if (sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate)) - converter.copyHostAssociateVar(*sym); -} - -void DataSharingProcessor::copyLastPrivateSymbol( - const Fortran::semantics::Symbol *sym, - [[maybe_unused]] mlir::OpBuilder::InsertPoint *lastPrivIP) { - if (sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) - converter.copyHostAssociateVar(*sym, lastPrivIP); -} - -void DataSharingProcessor::collectOmpObjectListSymbol( - const Fortran::parser::OmpObjectList &ompObjectList, - llvm::SetVector &symbolSet) { - for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) { - Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); - symbolSet.insert(sym); - } -} - -void DataSharingProcessor::collectSymbolsForPrivatization() { - bool hasCollapse = false; - for (const Fortran::parser::OmpClause &clause : opClauseList.v) { - if (const auto &privateClause = - std::get_if(&clause.u)) { - collectOmpObjectListSymbol(privateClause->v, privatizedSymbols); - } else if (const auto &firstPrivateClause = - std::get_if( - &clause.u)) { - collectOmpObjectListSymbol(firstPrivateClause->v, privatizedSymbols); - } else if (const auto &lastPrivateClause = - std::get_if( - &clause.u)) { - collectOmpObjectListSymbol(lastPrivateClause->v, privatizedSymbols); - hasLastPrivateOp = true; - } else if (std::get_if(&clause.u)) { - hasCollapse = true; - } - } - - if (hasCollapse && hasLastPrivateOp) - TODO(converter.getCurrentLocation(), "Collapse clause with lastprivate"); -} - -bool DataSharingProcessor::needBarrier() { - for (const Fortran::semantics::Symbol *sym : privatizedSymbols) { - if 
(sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate) && - sym->test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) - return true; - } - return false; -} - -void DataSharingProcessor::insertBarrier() { - // Emit implicit barrier to synchronize threads and avoid data races on - // initialization of firstprivate variables and post-update of lastprivate - // variables. - // FIXME: Emit barrier for lastprivate clause when 'sections' directive has - // 'nowait' clause. Otherwise, emit barrier when 'sections' directive has - // both firstprivate and lastprivate clause. - // Emit implicit barrier for linear clause. Maybe on somewhere else. - if (needBarrier()) - firOpBuilder.create(converter.getCurrentLocation()); -} - -void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { - bool cmpCreated = false; - mlir::OpBuilder::InsertPoint localInsPt = firOpBuilder.saveInsertionPoint(); - for (const Fortran::parser::OmpClause &clause : opClauseList.v) { - if (std::get_if(&clause.u)) { - // TODO: Add lastprivate support for simd construct - if (mlir::isa(op)) { - if (&eval == &eval.parentConstruct->getLastNestedEvaluation()) { - // For `omp.sections`, lastprivatized variables occur in - // lexically final `omp.section` operation. The following FIR - // shall be generated for the same: - // - // omp.sections lastprivate(...) { - // omp.section {...} - // omp.section {...} - // omp.section { - // fir.allocate for `private`/`firstprivate` - // - // fir.if %true { - // ^%lpv_update_blk - // } - // } - // } - // - // To keep code consistency while handling privatization - // through this control flow, add a `fir.if` operation - // that always evaluates to true, in order to create - // a dedicated sub-region in `omp.section` where - // lastprivate FIR can reside. Later canonicalizations - // will optimize away this operation. 
- if (!eval.lowerAsUnstructured()) { - auto ifOp = firOpBuilder.create( - op->getLoc(), - firOpBuilder.createIntegerConstant( - op->getLoc(), firOpBuilder.getIntegerType(1), 0x1), - /*else*/ false); - firOpBuilder.setInsertionPointToStart( - &ifOp.getThenRegion().front()); - - const Fortran::parser::OpenMPConstruct *parentOmpConstruct = - eval.parentConstruct->getIf(); - assert(parentOmpConstruct && - "Expected a valid enclosing OpenMP construct"); - const Fortran::parser::OpenMPSectionsConstruct *sectionsConstruct = - std::get_if( - &parentOmpConstruct->u); - assert(sectionsConstruct && - "Expected an enclosing omp.sections construct"); - const Fortran::parser::OmpClauseList §ionsEndClauseList = - std::get( - std::get( - sectionsConstruct->t) - .t); - for (const Fortran::parser::OmpClause &otherClause : - sectionsEndClauseList.v) - if (std::get_if( - &otherClause.u)) - // Emit implicit barrier to synchronize threads and avoid data - // races on post-update of lastprivate variables when `nowait` - // clause is present. - firOpBuilder.create( - converter.getCurrentLocation()); - firOpBuilder.setInsertionPointToStart( - &ifOp.getThenRegion().front()); - lastPrivIP = firOpBuilder.saveInsertionPoint(); - firOpBuilder.setInsertionPoint(ifOp); - insPt = firOpBuilder.saveInsertionPoint(); - } else { - // Lastprivate operation is inserted at the end - // of the lexically last section in the sections - // construct - mlir::OpBuilder::InsertPoint unstructuredSectionsIP = - firOpBuilder.saveInsertionPoint(); - mlir::Operation *lastOper = op->getRegion(0).back().getTerminator(); - firOpBuilder.setInsertionPoint(lastOper); - lastPrivIP = firOpBuilder.saveInsertionPoint(); - firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP); - } - } - } else if (mlir::isa(op)) { - // Update the original variable just before exiting the worksharing - // loop. Conversion as follows: - // - // omp.wsloop { - // omp.wsloop { ... - // ... 
store - // store ===> %v = arith.addi %iv, %step - // omp.yield %cmp = %step < 0 ? %v < %ub : %v > %ub - // } fir.if %cmp { - // fir.store %v to %loopIV - // ^%lpv_update_blk: - // } - // omp.yield - // } - // - - // Only generate the compare once in presence of multiple LastPrivate - // clauses. - if (cmpCreated) - continue; - cmpCreated = true; - - mlir::Location loc = op->getLoc(); - mlir::Operation *lastOper = op->getRegion(0).back().getTerminator(); - firOpBuilder.setInsertionPoint(lastOper); - - mlir::Value iv = op->getRegion(0).front().getArguments()[0]; - mlir::Value ub = - mlir::dyn_cast(op).getUpperBound()[0]; - mlir::Value step = mlir::dyn_cast(op).getStep()[0]; - - // v = iv + step - // cmp = step < 0 ? v < ub : v > ub - mlir::Value v = firOpBuilder.create(loc, iv, step); - mlir::Value zero = - firOpBuilder.createIntegerConstant(loc, step.getType(), 0); - mlir::Value negativeStep = firOpBuilder.create( - loc, mlir::arith::CmpIPredicate::slt, step, zero); - mlir::Value vLT = firOpBuilder.create( - loc, mlir::arith::CmpIPredicate::slt, v, ub); - mlir::Value vGT = firOpBuilder.create( - loc, mlir::arith::CmpIPredicate::sgt, v, ub); - mlir::Value cmpOp = firOpBuilder.create( - loc, negativeStep, vLT, vGT); - - auto ifOp = firOpBuilder.create(loc, cmpOp, /*else*/ false); - firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - assert(loopIV && "loopIV was not set"); - firOpBuilder.create(op->getLoc(), v, loopIV); - lastPrivIP = firOpBuilder.saveInsertionPoint(); - } else { - TODO(converter.getCurrentLocation(), - "lastprivate clause in constructs other than " - "simd/worksharing-loop"); - } - } - } - firOpBuilder.restoreInsertionPoint(localInsPt); -} - -void DataSharingProcessor::collectSymbols( - Fortran::semantics::Symbol::Flag flag) { - converter.collectSymbolSet(eval, defaultSymbols, flag, - /*collectSymbols=*/true, - /*collectHostAssociatedSymbols=*/true); - for (Fortran::lower::pft::Evaluation &e : eval.getNestedEvaluations()) { - if 
(e.hasNestedEvaluations()) - converter.collectSymbolSet(e, symbolsInNestedRegions, flag, - /*collectSymbols=*/true, - /*collectHostAssociatedSymbols=*/false); - else - converter.collectSymbolSet(e, symbolsInParentRegions, flag, - /*collectSymbols=*/false, - /*collectHostAssociatedSymbols=*/true); - } -} - -void DataSharingProcessor::collectDefaultSymbols() { - for (const Fortran::parser::OmpClause &clause : opClauseList.v) { - if (const auto &defaultClause = - std::get_if(&clause.u)) { - if (defaultClause->v.v == - Fortran::parser::OmpDefaultClause::Type::Private) - collectSymbols(Fortran::semantics::Symbol::Flag::OmpPrivate); - else if (defaultClause->v.v == - Fortran::parser::OmpDefaultClause::Type::Firstprivate) - collectSymbols(Fortran::semantics::Symbol::Flag::OmpFirstPrivate); - } - } -} - -void DataSharingProcessor::privatize() { - for (const Fortran::semantics::Symbol *sym : privatizedSymbols) { - if (const auto *commonDet = - sym->detailsIf()) { - for (const auto &mem : commonDet->objects()) { - cloneSymbol(&*mem); - copyFirstPrivateSymbol(&*mem); - } - } else { - cloneSymbol(sym); - copyFirstPrivateSymbol(sym); - } - } -} - -void DataSharingProcessor::copyLastPrivatize(mlir::Operation *op) { - insertLastPrivateCompare(op); - for (const Fortran::semantics::Symbol *sym : privatizedSymbols) - if (const auto *commonDet = - sym->detailsIf()) { - for (const auto &mem : commonDet->objects()) { - copyLastPrivateSymbol(&*mem, &lastPrivIP); - } - } else { - copyLastPrivateSymbol(sym, &lastPrivIP); - } -} - -void DataSharingProcessor::defaultPrivatize() { - for (const Fortran::semantics::Symbol *sym : defaultSymbols) { - if (!Fortran::semantics::IsProcedure(*sym) && - !sym->GetUltimate().has() && - !sym->GetUltimate().has() && - !symbolsInNestedRegions.contains(sym) && - !symbolsInParentRegions.contains(sym) && - !privatizedSymbols.contains(sym)) { - cloneSymbol(sym); - copyFirstPrivateSymbol(sym); - } - } -} - 
-//===----------------------------------------------------------------------===// -// ClauseProcessor -//===----------------------------------------------------------------------===// - -/// Class that handles the processing of OpenMP clauses. -/// -/// Its `process()` methods perform MLIR code generation for their -/// corresponding clause if it is present in the clause list. Otherwise, they -/// will return `false` to signal that the clause was not found. -/// -/// The intended use is of this class is to move clause processing outside of -/// construct processing, since the same clauses can appear attached to -/// different constructs and constructs can be combined, so that code -/// duplication is minimized. -/// -/// Each construct-lowering function only calls the `process()` -/// methods that relate to clauses that can impact the lowering of that -/// construct. -class ClauseProcessor { - using ClauseTy = Fortran::parser::OmpClause; - -public: - ClauseProcessor(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - const Fortran::parser::OmpClauseList &clauses) - : converter(converter), semaCtx(semaCtx), clauses(clauses) {} - - // 'Unique' clauses: They can appear at most once in the clause list. 
- bool - processCollapse(mlir::Location currentLocation, - Fortran::lower::pft::Evaluation &eval, - llvm::SmallVectorImpl &lowerBound, - llvm::SmallVectorImpl &upperBound, - llvm::SmallVectorImpl &step, - llvm::SmallVectorImpl &iv, - std::size_t &loopVarTypeSize) const; - bool processDefault() const; - bool processDevice(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const; - bool processDeviceType(mlir::omp::DeclareTargetDeviceType &result) const; - bool processFinal(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const; - bool processHint(mlir::IntegerAttr &result) const; - bool processMergeable(mlir::UnitAttr &result) const; - bool processNowait(mlir::UnitAttr &result) const; - bool processNumTeams(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const; - bool processNumThreads(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const; - bool processOrdered(mlir::IntegerAttr &result) const; - bool processPriority(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const; - bool processProcBind(mlir::omp::ClauseProcBindKindAttr &result) const; - bool processSafelen(mlir::IntegerAttr &result) const; - bool processSchedule(mlir::omp::ClauseScheduleKindAttr &valAttr, - mlir::omp::ScheduleModifierAttr &modifierAttr, - mlir::UnitAttr &simdModifierAttr) const; - bool processScheduleChunk(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const; - bool processSimdlen(mlir::IntegerAttr &result) const; - bool processThreadLimit(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const; - bool processUntied(mlir::UnitAttr &result) const; - - // 'Repeatable' clauses: They can appear multiple times in the clause list. 
- bool - processAllocate(llvm::SmallVectorImpl &allocatorOperands, - llvm::SmallVectorImpl &allocateOperands) const; - bool processCopyin() const; - bool processDepend(llvm::SmallVectorImpl &dependTypeOperands, - llvm::SmallVectorImpl &dependOperands) const; - bool - processEnter(llvm::SmallVectorImpl &result) const; - bool - processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName, - mlir::Value &result) const; - bool - processLink(llvm::SmallVectorImpl &result) const; - - // This method is used to process a map clause. - // The optional parameters - mapSymTypes, mapSymLocs & mapSymbols are used to - // store the original type, location and Fortran symbol for the map operands. - // They may be used later on to create the block_arguments for some of the - // target directives that require it. - bool processMap(mlir::Location currentLocation, - const llvm::omp::Directive &directive, - Fortran::lower::StatementContext &stmtCtx, - llvm::SmallVectorImpl &mapOperands, - llvm::SmallVectorImpl *mapSymTypes = nullptr, - llvm::SmallVectorImpl *mapSymLocs = nullptr, - llvm::SmallVectorImpl - *mapSymbols = nullptr) const; - bool - processReduction(mlir::Location currentLocation, - llvm::SmallVectorImpl &reductionVars, - llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl - *reductionSymbols = nullptr) const; - bool processSectionsReduction(mlir::Location currentLocation) const; - bool processTo(llvm::SmallVectorImpl &result) const; - bool - processUseDeviceAddr(llvm::SmallVectorImpl &operands, - llvm::SmallVectorImpl &useDeviceTypes, - llvm::SmallVectorImpl &useDeviceLocs, - llvm::SmallVectorImpl - &useDeviceSymbols) const; - bool - processUseDevicePtr(llvm::SmallVectorImpl &operands, - llvm::SmallVectorImpl &useDeviceTypes, - llvm::SmallVectorImpl &useDeviceLocs, - llvm::SmallVectorImpl - &useDeviceSymbols) const; - - template - bool processMotionClauses(Fortran::lower::StatementContext &stmtCtx, - llvm::SmallVectorImpl &mapOperands); - - 
// Call this method for these clauses that should be supported but are not - // implemented yet. It triggers a compilation error if any of the given - // clauses is found. - template - void processTODO(mlir::Location currentLocation, - llvm::omp::Directive directive) const; - -private: - using ClauseIterator = std::list::const_iterator; - - /// Utility to find a clause within a range in the clause list. - template - static ClauseIterator findClause(ClauseIterator begin, ClauseIterator end) { - for (ClauseIterator it = begin; it != end; ++it) { - if (std::get_if(&it->u)) - return it; - } - - return end; - } - - /// Return the first instance of the given clause found in the clause list or - /// `nullptr` if not present. If more than one instance is expected, use - /// `findRepeatableClause` instead. - template - const T * - findUniqueClause(const Fortran::parser::CharBlock **source = nullptr) const { - ClauseIterator it = findClause(clauses.v.begin(), clauses.v.end()); - if (it != clauses.v.end()) { - if (source) - *source = &it->source; - return &std::get(it->u); - } - return nullptr; - } - - /// Call `callbackFn` for each occurrence of the given clause. Return `true` - /// if at least one instance was found. - template - bool findRepeatableClause( - std::function - callbackFn) const { - bool found = false; - ClauseIterator nextIt, endIt = clauses.v.end(); - for (ClauseIterator it = clauses.v.begin(); it != endIt; it = nextIt) { - nextIt = findClause(it, endIt); - - if (nextIt != endIt) { - callbackFn(&std::get(nextIt->u), nextIt->source); - found = true; - ++nextIt; - } - } - return found; - } - - /// Set the `result` to a new `mlir::UnitAttr` if the clause is present. 
- template - bool markClauseOccurrence(mlir::UnitAttr &result) const { - if (findUniqueClause()) { - result = converter.getFirOpBuilder().getUnitAttr(); - return true; - } - return false; - } - - Fortran::lower::AbstractConverter &converter; - Fortran::semantics::SemanticsContext &semaCtx; - const Fortran::parser::OmpClauseList &clauses; -}; - -//===----------------------------------------------------------------------===// -// ClauseProcessor helper functions -//===----------------------------------------------------------------------===// - -/// Check for unsupported map operand types. -static void checkMapType(mlir::Location location, mlir::Type type) { - if (auto refType = type.dyn_cast()) - type = refType.getElementType(); - if (auto boxType = type.dyn_cast_or_null()) - if (!boxType.getElementType().isa()) - TODO(location, "OMPD_target_data MapOperand BoxType"); -} - -class ReductionProcessor { -public: - // TODO: Move this enumeration to the OpenMP dialect - enum ReductionIdentifier { - ID, - USER_DEF_OP, - ADD, - SUBTRACT, - MULTIPLY, - AND, - OR, - EQV, - NEQV, - MAX, - MIN, - IAND, - IOR, - IEOR - }; - static ReductionIdentifier - getReductionType(const Fortran::parser::ProcedureDesignator &pd) { - auto redType = llvm::StringSwitch>( - getRealName(pd).ToString()) - .Case("max", ReductionIdentifier::MAX) - .Case("min", ReductionIdentifier::MIN) - .Case("iand", ReductionIdentifier::IAND) - .Case("ior", ReductionIdentifier::IOR) - .Case("ieor", ReductionIdentifier::IEOR) - .Default(std::nullopt); - assert(redType && "Invalid Reduction"); - return *redType; - } - - static ReductionIdentifier getReductionType( - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp) { - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - return ReductionIdentifier::ADD; - case Fortran::parser::DefinedOperator::IntrinsicOperator::Subtract: - return ReductionIdentifier::SUBTRACT; - case 
Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - return ReductionIdentifier::MULTIPLY; - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: - return ReductionIdentifier::AND; - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: - return ReductionIdentifier::EQV; - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: - return ReductionIdentifier::OR; - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: - return ReductionIdentifier::NEQV; - default: - llvm_unreachable("unexpected intrinsic operator in reduction"); - } - } - - static bool supportedIntrinsicProcReduction( - const Fortran::parser::ProcedureDesignator &pd) { - const auto *name{Fortran::parser::Unwrap(pd)}; - assert(name && "Invalid Reduction Intrinsic."); - if (!name->symbol->GetUltimate().attrs().test( - Fortran::semantics::Attr::INTRINSIC)) - return false; - auto redType = llvm::StringSwitch(getRealName(name).ToString()) - .Case("max", true) - .Case("min", true) - .Case("iand", true) - .Case("ior", true) - .Case("ieor", true) - .Default(false); - return redType; - } - - static const Fortran::semantics::SourceName - getRealName(const Fortran::parser::Name *name) { - return name->symbol->GetUltimate().name(); - } - - static const Fortran::semantics::SourceName - getRealName(const Fortran::parser::ProcedureDesignator &pd) { - const auto *name{Fortran::parser::Unwrap(pd)}; - assert(name && "Invalid Reduction Intrinsic."); - return getRealName(name); - } - - static std::string getReductionName(llvm::StringRef name, mlir::Type ty) { - return (llvm::Twine(name) + - (ty.isIntOrIndex() ? 
llvm::Twine("_i_") : llvm::Twine("_f_")) + - llvm::Twine(ty.getIntOrFloatBitWidth())) - .str(); - } - - static std::string getReductionName( - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - mlir::Type ty) { - std::string reductionName; - - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - reductionName = "add_reduction"; - break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - reductionName = "multiply_reduction"; - break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: - return "and_reduction"; - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: - return "eqv_reduction"; - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: - return "or_reduction"; - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: - return "neqv_reduction"; - default: - reductionName = "other_reduction"; - break; - } - - return getReductionName(reductionName, ty); - } - - /// This function returns the identity value of the operator \p - /// reductionOpName. 
For example: - /// 0 + x = x, - /// 1 * x = x - static int getOperationIdentity(ReductionIdentifier redId, - mlir::Location loc) { - switch (redId) { - case ReductionIdentifier::ADD: - case ReductionIdentifier::OR: - case ReductionIdentifier::NEQV: - return 0; - case ReductionIdentifier::MULTIPLY: - case ReductionIdentifier::AND: - case ReductionIdentifier::EQV: - return 1; - default: - TODO(loc, "Reduction of some intrinsic operators is not supported"); - } - } - - static mlir::Value getReductionInitValue(mlir::Location loc, mlir::Type type, - ReductionIdentifier redId, - fir::FirOpBuilder &builder) { - assert((fir::isa_integer(type) || fir::isa_real(type) || - type.isa()) && - "only integer, logical and real types are currently supported"); - switch (redId) { - case ReductionIdentifier::MAX: { - if (auto ty = type.dyn_cast()) { - const llvm::fltSemantics &sem = ty.getFloatSemantics(); - return builder.createRealConstant( - loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/true)); - } - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t minInt = llvm::APInt::getSignedMinValue(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, minInt); - } - case ReductionIdentifier::MIN: { - if (auto ty = type.dyn_cast()) { - const llvm::fltSemantics &sem = ty.getFloatSemantics(); - return builder.createRealConstant( - loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/false)); - } - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t maxInt = llvm::APInt::getSignedMaxValue(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, maxInt); - } - case ReductionIdentifier::IOR: { - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, zeroInt); - } - case ReductionIdentifier::IEOR: { - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); - return 
builder.createIntegerConstant(loc, type, zeroInt); - } - case ReductionIdentifier::IAND: { - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t allOnInt = llvm::APInt::getAllOnes(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, allOnInt); - } - case ReductionIdentifier::ADD: - case ReductionIdentifier::MULTIPLY: - case ReductionIdentifier::AND: - case ReductionIdentifier::OR: - case ReductionIdentifier::EQV: - case ReductionIdentifier::NEQV: - if (type.isa()) - return builder.create( - loc, type, - builder.getFloatAttr(type, - (double)getOperationIdentity(redId, loc))); - - if (type.isa()) { - mlir::Value intConst = builder.create( - loc, builder.getI1Type(), - builder.getIntegerAttr(builder.getI1Type(), - getOperationIdentity(redId, loc))); - return builder.createConvert(loc, type, intConst); - } - - return builder.create( - loc, type, - builder.getIntegerAttr(type, getOperationIdentity(redId, loc))); - case ReductionIdentifier::ID: - case ReductionIdentifier::USER_DEF_OP: - case ReductionIdentifier::SUBTRACT: - TODO(loc, "Reduction of some identifier types is not supported"); - } - llvm_unreachable("Unhandled Reduction identifier : getReductionInitValue"); - } - - template - static mlir::Value getReductionOperation(fir::FirOpBuilder &builder, - mlir::Type type, mlir::Location loc, - mlir::Value op1, mlir::Value op2) { - assert(type.isIntOrIndexOrFloat() && - "only integer and float types are currently supported"); - if (type.isIntOrIndex()) - return builder.create(loc, op1, op2); - return builder.create(loc, op1, op2); - } - - static mlir::Value createScalarCombiner(fir::FirOpBuilder &builder, - mlir::Location loc, - ReductionIdentifier redId, - mlir::Type type, mlir::Value op1, - mlir::Value op2) { - mlir::Value reductionOp; - switch (redId) { - case ReductionIdentifier::MAX: - reductionOp = - getReductionOperation( - builder, type, loc, op1, op2); - break; - case ReductionIdentifier::MIN: - reductionOp = - getReductionOperation( - 
builder, type, loc, op1, op2); - break; - case ReductionIdentifier::IOR: - assert((type.isIntOrIndex()) && "only integer is expected"); - reductionOp = builder.create(loc, op1, op2); - break; - case ReductionIdentifier::IEOR: - assert((type.isIntOrIndex()) && "only integer is expected"); - reductionOp = builder.create(loc, op1, op2); - break; - case ReductionIdentifier::IAND: - assert((type.isIntOrIndex()) && "only integer is expected"); - reductionOp = builder.create(loc, op1, op2); - break; - case ReductionIdentifier::ADD: - reductionOp = - getReductionOperation( - builder, type, loc, op1, op2); - break; - case ReductionIdentifier::MULTIPLY: - reductionOp = - getReductionOperation( - builder, type, loc, op1, op2); - break; - case ReductionIdentifier::AND: { - mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); - mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - - mlir::Value andiOp = - builder.create(loc, op1I1, op2I1); - - reductionOp = builder.createConvert(loc, type, andiOp); - break; - } - case ReductionIdentifier::OR: { - mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); - mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - - mlir::Value oriOp = builder.create(loc, op1I1, op2I1); - - reductionOp = builder.createConvert(loc, type, oriOp); - break; - } - case ReductionIdentifier::EQV: { - mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); - mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - - mlir::Value cmpiOp = builder.create( - loc, mlir::arith::CmpIPredicate::eq, op1I1, op2I1); - - reductionOp = builder.createConvert(loc, type, cmpiOp); - break; - } - case ReductionIdentifier::NEQV: { - mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); - mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - - mlir::Value cmpiOp = builder.create( - loc, mlir::arith::CmpIPredicate::ne, 
op1I1, op2I1); - - reductionOp = builder.createConvert(loc, type, cmpiOp); - break; - } - default: - TODO(loc, "Reduction of some intrinsic operators is not supported"); - } - - return reductionOp; - } - - /// Creates an OpenMP reduction declaration and inserts it into the provided - /// symbol table. The declaration has a constant initializer with the neutral - /// value `initValue`, and the reduction combiner carried over from `reduce`. - /// TODO: Generalize this for non-integer types, add atomic region. - static mlir::omp::ReductionDeclareOp createReductionDecl( - fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, - const ReductionIdentifier redId, mlir::Type type, mlir::Location loc) { - mlir::OpBuilder::InsertionGuard guard(builder); - mlir::ModuleOp module = builder.getModule(); - - auto decl = - module.lookupSymbol(reductionOpName); - if (decl) - return decl; - - mlir::OpBuilder modBuilder(module.getBodyRegion()); - - decl = modBuilder.create( - loc, reductionOpName, type); - builder.createBlock(&decl.getInitializerRegion(), - decl.getInitializerRegion().end(), {type}, {loc}); - builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); - mlir::Value init = getReductionInitValue(loc, type, redId, builder); - builder.create(loc, init); - - builder.createBlock(&decl.getReductionRegion(), - decl.getReductionRegion().end(), {type, type}, - {loc, loc}); - - builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); - mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); - mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); - - mlir::Value reductionOp = - createScalarCombiner(builder, loc, redId, type, op1, op2); - builder.create(loc, reductionOp); - - return decl; - } - - /// Creates a reduction declaration and associates it with an OpenMP block - /// directive. 
- static void - addReductionDecl(mlir::Location currentLocation, - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::OmpReductionClause &reduction, - llvm::SmallVectorImpl &reductionVars, - llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl - *reductionSymbols = nullptr) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::omp::ReductionDeclareOp decl; - const auto &redOperator{ - std::get(reduction.t)}; - const auto &objectList{ - std::get(reduction.t)}; - if (const auto &redDefinedOp = - std::get_if(&redOperator.u)) { - const auto &intrinsicOp{ - std::get( - redDefinedOp->u)}; - ReductionIdentifier redId = getReductionType(intrinsicOp); - switch (redId) { - case ReductionIdentifier::ADD: - case ReductionIdentifier::MULTIPLY: - case ReductionIdentifier::AND: - case ReductionIdentifier::EQV: - case ReductionIdentifier::OR: - case ReductionIdentifier::NEQV: - break; - default: - TODO(currentLocation, - "Reduction of some intrinsic operators is not supported"); - break; - } - for (const Fortran::parser::OmpObject &ompObject : objectList.v) { - if (const auto *name{ - Fortran::parser::Unwrap(ompObject)}) { - if (const Fortran::semantics::Symbol * symbol{name->symbol}) { - if (reductionSymbols) - reductionSymbols->push_back(symbol); - mlir::Value symVal = converter.getSymbolAddress(*symbol); - if (auto declOp = symVal.getDefiningOp()) - symVal = declOp.getBase(); - mlir::Type redType = - symVal.getType().cast().getEleTy(); - reductionVars.push_back(symVal); - if (redType.isa()) - decl = createReductionDecl( - firOpBuilder, - getReductionName(intrinsicOp, firOpBuilder.getI1Type()), - redId, redType, currentLocation); - else if (redType.isIntOrIndexOrFloat()) { - decl = createReductionDecl(firOpBuilder, - getReductionName(intrinsicOp, redType), - redId, redType, currentLocation); - } else { - TODO(currentLocation, "Reduction of some types is not supported"); - } - 
reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( - firOpBuilder.getContext(), decl.getSymName())); - } - } - } - } else if (const auto *reductionIntrinsic = - std::get_if( - &redOperator.u)) { - if (ReductionProcessor::supportedIntrinsicProcReduction( - *reductionIntrinsic)) { - ReductionProcessor::ReductionIdentifier redId = - ReductionProcessor::getReductionType(*reductionIntrinsic); - for (const Fortran::parser::OmpObject &ompObject : objectList.v) { - if (const auto *name{ - Fortran::parser::Unwrap(ompObject)}) { - if (const Fortran::semantics::Symbol * symbol{name->symbol}) { - if (reductionSymbols) - reductionSymbols->push_back(symbol); - mlir::Value symVal = converter.getSymbolAddress(*symbol); - if (auto declOp = symVal.getDefiningOp()) - symVal = declOp.getBase(); - mlir::Type redType = - symVal.getType().cast().getEleTy(); - reductionVars.push_back(symVal); - assert(redType.isIntOrIndexOrFloat() && - "Unsupported reduction type"); - decl = createReductionDecl( - firOpBuilder, - getReductionName(getRealName(*reductionIntrinsic).ToString(), - redType), - redId, redType, currentLocation); - reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( - firOpBuilder.getContext(), decl.getSymName())); - } - } - } - } - } - } -}; - -static mlir::omp::ScheduleModifier -translateScheduleModifier(const Fortran::parser::OmpScheduleModifierType &m) { - switch (m.v) { - case Fortran::parser::OmpScheduleModifierType::ModType::Monotonic: - return mlir::omp::ScheduleModifier::monotonic; - case Fortran::parser::OmpScheduleModifierType::ModType::Nonmonotonic: - return mlir::omp::ScheduleModifier::nonmonotonic; - case Fortran::parser::OmpScheduleModifierType::ModType::Simd: - return mlir::omp::ScheduleModifier::simd; - } - return mlir::omp::ScheduleModifier::none; -} - -static mlir::omp::ScheduleModifier -getScheduleModifier(const Fortran::parser::OmpScheduleClause &x) { - const auto &modifier = - std::get>(x.t); - // The input may have the modifier any order, so we 
look for one that isn't - // SIMD. If modifier is not set at all, fall down to the bottom and return - // "none". - if (modifier) { - const auto &modType1 = - std::get(modifier->t); - if (modType1.v.v == - Fortran::parser::OmpScheduleModifierType::ModType::Simd) { - const auto &modType2 = std::get< - std::optional>( - modifier->t); - if (modType2 && - modType2->v.v != - Fortran::parser::OmpScheduleModifierType::ModType::Simd) - return translateScheduleModifier(modType2->v); - - return mlir::omp::ScheduleModifier::none; - } - - return translateScheduleModifier(modType1.v); - } - return mlir::omp::ScheduleModifier::none; -} - -static mlir::omp::ScheduleModifier -getSimdModifier(const Fortran::parser::OmpScheduleClause &x) { - const auto &modifier = - std::get>(x.t); - // Either of the two possible modifiers in the input can be the SIMD modifier, - // so look in either one, and return simd if we find one. Not found = return - // "none". - if (modifier) { - const auto &modType1 = - std::get(modifier->t); - if (modType1.v.v == Fortran::parser::OmpScheduleModifierType::ModType::Simd) - return mlir::omp::ScheduleModifier::simd; - - const auto &modType2 = std::get< - std::optional>( - modifier->t); - if (modType2 && modType2->v.v == - Fortran::parser::OmpScheduleModifierType::ModType::Simd) - return mlir::omp::ScheduleModifier::simd; - } - return mlir::omp::ScheduleModifier::none; -} - -static void -genAllocateClause(Fortran::lower::AbstractConverter &converter, - const Fortran::parser::OmpAllocateClause &ompAllocateClause, - llvm::SmallVectorImpl &allocatorOperands, - llvm::SmallVectorImpl &allocateOperands) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::Location currentLocation = converter.getCurrentLocation(); - Fortran::lower::StatementContext stmtCtx; - - mlir::Value allocatorOperand; - const Fortran::parser::OmpObjectList &ompObjectList = - std::get(ompAllocateClause.t); - const auto &allocateModifier = std::get< - std::optional>( - 
ompAllocateClause.t); - - // If the allocate modifier is present, check if we only use the allocator - // submodifier. ALIGN in this context is unimplemented - const bool onlyAllocator = - allocateModifier && - std::holds_alternative< - Fortran::parser::OmpAllocateClause::AllocateModifier::Allocator>( - allocateModifier->u); - - if (allocateModifier && !onlyAllocator) { - TODO(currentLocation, "OmpAllocateClause ALIGN modifier"); - } - - // Check if allocate clause has allocator specified. If so, add it - // to list of allocators, otherwise, add default allocator to - // list of allocators. - if (onlyAllocator) { - const auto &allocatorValue = std::get< - Fortran::parser::OmpAllocateClause::AllocateModifier::Allocator>( - allocateModifier->u); - allocatorOperand = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(allocatorValue.v), stmtCtx)); - allocatorOperands.insert(allocatorOperands.end(), ompObjectList.v.size(), - allocatorOperand); - } else { - allocatorOperand = firOpBuilder.createIntegerConstant( - currentLocation, firOpBuilder.getI32Type(), 1); - allocatorOperands.insert(allocatorOperands.end(), ompObjectList.v.size(), - allocatorOperand); - } - genObjectList(ompObjectList, converter, allocateOperands); -} - -static mlir::omp::ClauseProcBindKindAttr genProcBindKindAttr( - fir::FirOpBuilder &firOpBuilder, - const Fortran::parser::OmpClause::ProcBind *procBindClause) { - mlir::omp::ClauseProcBindKind procBindKind; - switch (procBindClause->v.v) { - case Fortran::parser::OmpProcBindClause::Type::Master: - procBindKind = mlir::omp::ClauseProcBindKind::Master; - break; - case Fortran::parser::OmpProcBindClause::Type::Close: - procBindKind = mlir::omp::ClauseProcBindKind::Close; - break; - case Fortran::parser::OmpProcBindClause::Type::Spread: - procBindKind = mlir::omp::ClauseProcBindKind::Spread; - break; - case Fortran::parser::OmpProcBindClause::Type::Primary: - procBindKind = mlir::omp::ClauseProcBindKind::Primary; - break; - } - return 
mlir::omp::ClauseProcBindKindAttr::get(firOpBuilder.getContext(), - procBindKind); -} - -static mlir::omp::ClauseTaskDependAttr -genDependKindAttr(fir::FirOpBuilder &firOpBuilder, - const Fortran::parser::OmpClause::Depend *dependClause) { - mlir::omp::ClauseTaskDepend pbKind; - switch ( - std::get( - std::get(dependClause->v.u) - .t) - .v) { - case Fortran::parser::OmpDependenceType::Type::In: - pbKind = mlir::omp::ClauseTaskDepend::taskdependin; - break; - case Fortran::parser::OmpDependenceType::Type::Out: - pbKind = mlir::omp::ClauseTaskDepend::taskdependout; - break; - case Fortran::parser::OmpDependenceType::Type::Inout: - pbKind = mlir::omp::ClauseTaskDepend::taskdependinout; - break; - default: - llvm_unreachable("unknown parser task dependence type"); - break; - } - return mlir::omp::ClauseTaskDependAttr::get(firOpBuilder.getContext(), - pbKind); -} - -static mlir::Value getIfClauseOperand( - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::OmpClause::If *ifClause, - Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName, - mlir::Location clauseLocation) { - // Only consider the clause if it's intended for the given directive. 
- auto &directive = std::get< - std::optional>( - ifClause->v.t); - if (directive && directive.value() != directiveName) - return nullptr; - - Fortran::lower::StatementContext stmtCtx; - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - auto &expr = std::get(ifClause->v.t); - mlir::Value ifVal = fir::getBase( - converter.genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx)); - return firOpBuilder.createConvert(clauseLocation, firOpBuilder.getI1Type(), - ifVal); -} - -static void -addUseDeviceClause(Fortran::lower::AbstractConverter &converter, - const Fortran::parser::OmpObjectList &useDeviceClause, - llvm::SmallVectorImpl &operands, - llvm::SmallVectorImpl &useDeviceTypes, - llvm::SmallVectorImpl &useDeviceLocs, - llvm::SmallVectorImpl - &useDeviceSymbols) { - genObjectList(useDeviceClause, converter, operands); - for (mlir::Value &operand : operands) { - checkMapType(operand.getLoc(), operand.getType()); - useDeviceTypes.push_back(operand.getType()); - useDeviceLocs.push_back(operand.getLoc()); - } - for (const Fortran::parser::OmpObject &ompObject : useDeviceClause.v) { - Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); - useDeviceSymbols.push_back(sym); - } -} - -//===----------------------------------------------------------------------===// -// ClauseProcessor unique clauses -//===----------------------------------------------------------------------===// - -bool ClauseProcessor::processCollapse( - mlir::Location currentLocation, Fortran::lower::pft::Evaluation &eval, - llvm::SmallVectorImpl &lowerBound, - llvm::SmallVectorImpl &upperBound, - llvm::SmallVectorImpl &step, - llvm::SmallVectorImpl &iv, - std::size_t &loopVarTypeSize) const { - bool found = false; - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - - // Collect the loops to collapse. 
- Fortran::lower::pft::Evaluation *doConstructEval = - &eval.getFirstNestedEvaluation(); - if (doConstructEval->getIf() - ->IsDoConcurrent()) { - TODO(currentLocation, "Do Concurrent in Worksharing loop construct"); - } - - std::int64_t collapseValue = 1l; - if (auto *collapseClause = findUniqueClause()) { - const auto *expr = Fortran::semantics::GetExpr(collapseClause->v); - collapseValue = Fortran::evaluate::ToInt64(*expr).value(); - found = true; - } - - loopVarTypeSize = 0; - do { - Fortran::lower::pft::Evaluation *doLoop = - &doConstructEval->getFirstNestedEvaluation(); - auto *doStmt = doLoop->getIf(); - assert(doStmt && "Expected do loop to be in the nested evaluation"); - const auto &loopControl = - std::get>(doStmt->t); - const Fortran::parser::LoopControl::Bounds *bounds = - std::get_if(&loopControl->u); - assert(bounds && "Expected bounds for worksharing do loop"); - Fortran::lower::StatementContext stmtCtx; - lowerBound.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->lower), stmtCtx))); - upperBound.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->upper), stmtCtx))); - if (bounds->step) { - step.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->step), stmtCtx))); - } else { // If `step` is not present, assume it as `1`. 
- step.push_back(firOpBuilder.createIntegerConstant( - currentLocation, firOpBuilder.getIntegerType(32), 1)); - } - iv.push_back(bounds->name.thing.symbol); - loopVarTypeSize = std::max(loopVarTypeSize, - bounds->name.thing.symbol->GetUltimate().size()); - collapseValue--; - doConstructEval = - &*std::next(doConstructEval->getNestedEvaluations().begin()); - } while (collapseValue > 0); - - return found; -} - -bool ClauseProcessor::processDefault() const { - if (auto *defaultClause = findUniqueClause()) { - // Private, Firstprivate, Shared, None - switch (defaultClause->v.v) { - case Fortran::parser::OmpDefaultClause::Type::Shared: - case Fortran::parser::OmpDefaultClause::Type::None: - // Default clause with shared or none do not require any handling since - // Shared is the default behavior in the IR and None is only required - // for semantic checks. - break; - case Fortran::parser::OmpDefaultClause::Type::Private: - // TODO Support default(private) - break; - case Fortran::parser::OmpDefaultClause::Type::Firstprivate: - // TODO Support default(firstprivate) - break; - } - return true; - } - return false; -} - -bool ClauseProcessor::processDevice(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const { - const Fortran::parser::CharBlock *source = nullptr; - if (auto *deviceClause = findUniqueClause(&source)) { - mlir::Location clauseLocation = converter.genLocation(*source); - if (auto deviceModifier = std::get< - std::optional>( - deviceClause->v.t)) { - if (deviceModifier == - Fortran::parser::OmpDeviceClause::DeviceModifier::Ancestor) { - TODO(clauseLocation, "OMPD_target Device Modifier Ancestor"); - } - } - if (const auto *deviceExpr = Fortran::semantics::GetExpr( - std::get(deviceClause->v.t))) { - result = fir::getBase(converter.genExprValue(*deviceExpr, stmtCtx)); - } - return true; - } - return false; -} - -bool ClauseProcessor::processDeviceType( - mlir::omp::DeclareTargetDeviceType &result) const { - if (auto *deviceTypeClause = 
findUniqueClause()) { - // Case: declare target ... device_type(any | host | nohost) - switch (deviceTypeClause->v.v) { - case Fortran::parser::OmpDeviceTypeClause::Type::Nohost: - result = mlir::omp::DeclareTargetDeviceType::nohost; - break; - case Fortran::parser::OmpDeviceTypeClause::Type::Host: - result = mlir::omp::DeclareTargetDeviceType::host; - break; - case Fortran::parser::OmpDeviceTypeClause::Type::Any: - result = mlir::omp::DeclareTargetDeviceType::any; - break; - } - return true; - } - return false; -} - -bool ClauseProcessor::processFinal(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const { - const Fortran::parser::CharBlock *source = nullptr; - if (auto *finalClause = findUniqueClause(&source)) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::Location clauseLocation = converter.genLocation(*source); - - mlir::Value finalVal = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(finalClause->v), stmtCtx)); - result = firOpBuilder.createConvert(clauseLocation, - firOpBuilder.getI1Type(), finalVal); - return true; - } - return false; -} - -bool ClauseProcessor::processHint(mlir::IntegerAttr &result) const { - if (auto *hintClause = findUniqueClause()) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - const auto *expr = Fortran::semantics::GetExpr(hintClause->v); - int64_t hintValue = *Fortran::evaluate::ToInt64(*expr); - result = firOpBuilder.getI64IntegerAttr(hintValue); - return true; - } - return false; -} - -bool ClauseProcessor::processMergeable(mlir::UnitAttr &result) const { - return markClauseOccurrence(result); -} - -bool ClauseProcessor::processNowait(mlir::UnitAttr &result) const { - return markClauseOccurrence(result); -} - -bool ClauseProcessor::processNumTeams(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const { - // TODO Get lower and upper bounds for num_teams when parser is updated to - // accept both. 
- if (auto *numTeamsClause = findUniqueClause()) { - result = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(numTeamsClause->v), stmtCtx)); - return true; - } - return false; -} - -bool ClauseProcessor::processNumThreads( - Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const { - if (auto *numThreadsClause = findUniqueClause()) { - // OMPIRBuilder expects `NUM_THREADS` clause as a `Value`. - result = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(numThreadsClause->v), stmtCtx)); - return true; - } - return false; -} - -bool ClauseProcessor::processOrdered(mlir::IntegerAttr &result) const { - if (auto *orderedClause = findUniqueClause()) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - int64_t orderedClauseValue = 0l; - if (orderedClause->v.has_value()) { - const auto *expr = Fortran::semantics::GetExpr(orderedClause->v); - orderedClauseValue = *Fortran::evaluate::ToInt64(*expr); - } - result = firOpBuilder.getI64IntegerAttr(orderedClauseValue); - return true; - } - return false; -} - -bool ClauseProcessor::processPriority(Fortran::lower::StatementContext &stmtCtx, - mlir::Value &result) const { - if (auto *priorityClause = findUniqueClause()) { - result = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(priorityClause->v), stmtCtx)); - return true; - } - return false; -} - -bool ClauseProcessor::processProcBind( - mlir::omp::ClauseProcBindKindAttr &result) const { - if (auto *procBindClause = findUniqueClause()) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - result = genProcBindKindAttr(firOpBuilder, procBindClause); - return true; - } - return false; -} - -bool ClauseProcessor::processSafelen(mlir::IntegerAttr &result) const { - if (auto *safelenClause = findUniqueClause()) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - const auto *expr = Fortran::semantics::GetExpr(safelenClause->v); - const std::optional safelenVal = - 
Fortran::evaluate::ToInt64(*expr); - result = firOpBuilder.getI64IntegerAttr(*safelenVal); - return true; - } - return false; -} - -bool ClauseProcessor::processSchedule( - mlir::omp::ClauseScheduleKindAttr &valAttr, - mlir::omp::ScheduleModifierAttr &modifierAttr, - mlir::UnitAttr &simdModifierAttr) const { - if (auto *scheduleClause = findUniqueClause()) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::MLIRContext *context = firOpBuilder.getContext(); - const Fortran::parser::OmpScheduleClause &scheduleType = scheduleClause->v; - const auto &scheduleClauseKind = - std::get( - scheduleType.t); - - mlir::omp::ClauseScheduleKind scheduleKind; - switch (scheduleClauseKind) { - case Fortran::parser::OmpScheduleClause::ScheduleType::Static: - scheduleKind = mlir::omp::ClauseScheduleKind::Static; - break; - case Fortran::parser::OmpScheduleClause::ScheduleType::Dynamic: - scheduleKind = mlir::omp::ClauseScheduleKind::Dynamic; - break; - case Fortran::parser::OmpScheduleClause::ScheduleType::Guided: - scheduleKind = mlir::omp::ClauseScheduleKind::Guided; - break; - case Fortran::parser::OmpScheduleClause::ScheduleType::Auto: - scheduleKind = mlir::omp::ClauseScheduleKind::Auto; - break; - case Fortran::parser::OmpScheduleClause::ScheduleType::Runtime: - scheduleKind = mlir::omp::ClauseScheduleKind::Runtime; - break; - } - - mlir::omp::ScheduleModifier scheduleModifier = - getScheduleModifier(scheduleClause->v); - - if (scheduleModifier != mlir::omp::ScheduleModifier::none) - modifierAttr = - mlir::omp::ScheduleModifierAttr::get(context, scheduleModifier); - - if (getSimdModifier(scheduleClause->v) != mlir::omp::ScheduleModifier::none) - simdModifierAttr = firOpBuilder.getUnitAttr(); - - valAttr = mlir::omp::ClauseScheduleKindAttr::get(context, scheduleKind); - return true; - } - return false; -} - -bool ClauseProcessor::processScheduleChunk( - Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const { - if (auto *scheduleClause = 
findUniqueClause()) { - if (const auto &chunkExpr = - std::get>( - scheduleClause->v.t)) { - if (const auto *expr = Fortran::semantics::GetExpr(*chunkExpr)) { - result = fir::getBase(converter.genExprValue(*expr, stmtCtx)); - } - } - return true; - } - return false; -} - -bool ClauseProcessor::processSimdlen(mlir::IntegerAttr &result) const { - if (auto *simdlenClause = findUniqueClause()) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - const auto *expr = Fortran::semantics::GetExpr(simdlenClause->v); - const std::optional simdlenVal = - Fortran::evaluate::ToInt64(*expr); - result = firOpBuilder.getI64IntegerAttr(*simdlenVal); - return true; - } - return false; -} - -bool ClauseProcessor::processThreadLimit( - Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const { - if (auto *threadLmtClause = findUniqueClause()) { - result = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(threadLmtClause->v), stmtCtx)); - return true; - } - return false; -} - -bool ClauseProcessor::processUntied(mlir::UnitAttr &result) const { - return markClauseOccurrence(result); -} - -//===----------------------------------------------------------------------===// -// ClauseProcessor repeatable clauses -//===----------------------------------------------------------------------===// - -bool ClauseProcessor::processAllocate( - llvm::SmallVectorImpl &allocatorOperands, - llvm::SmallVectorImpl &allocateOperands) const { - return findRepeatableClause( - [&](const ClauseTy::Allocate *allocateClause, - const Fortran::parser::CharBlock &) { - genAllocateClause(converter, allocateClause->v, allocatorOperands, - allocateOperands); - }); -} - -bool ClauseProcessor::processCopyin() const { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::OpBuilder::InsertPoint insPt = firOpBuilder.saveInsertionPoint(); - firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); - auto checkAndCopyHostAssociateVar = - 
[&](Fortran::semantics::Symbol *sym, - mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr) { - assert(sym->has() && - "No host-association found"); - if (converter.isPresentShallowLookup(*sym)) - converter.copyHostAssociateVar(*sym, copyAssignIP); - }; - bool hasCopyin = findRepeatableClause( - [&](const ClauseTy::Copyin *copyinClause, - const Fortran::parser::CharBlock &) { - const Fortran::parser::OmpObjectList &ompObjectList = copyinClause->v; - for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) { - Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); - if (const auto *commonDetails = - sym->detailsIf()) { - for (const auto &mem : commonDetails->objects()) - checkAndCopyHostAssociateVar(&*mem, &insPt); - break; - } - if (Fortran::semantics::IsAllocatableOrObjectPointer( - &sym->GetUltimate())) - TODO(converter.getCurrentLocation(), - "pointer or allocatable variables in Copyin clause"); - assert(sym->has() && - "No host-association found"); - checkAndCopyHostAssociateVar(sym); - } - }); - - // [OMP 5.0, 2.19.6.1] The copy is done after the team is formed and prior to - // the execution of the associated structured block. Emit implicit barrier to - // synchronize threads and avoid data races on propagation master's thread - // values of threadprivate variables to local instances of that variables of - // all other implicit threads. 
- if (hasCopyin) - firOpBuilder.create(converter.getCurrentLocation()); - firOpBuilder.restoreInsertionPoint(insPt); - return hasCopyin; -} - -bool ClauseProcessor::processDepend( - llvm::SmallVectorImpl &dependTypeOperands, - llvm::SmallVectorImpl &dependOperands) const { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - - return findRepeatableClause( - [&](const ClauseTy::Depend *dependClause, - const Fortran::parser::CharBlock &) { - const std::list &depVal = - std::get>( - std::get( - dependClause->v.u) - .t); - mlir::omp::ClauseTaskDependAttr dependTypeOperand = - genDependKindAttr(firOpBuilder, dependClause); - dependTypeOperands.insert(dependTypeOperands.end(), depVal.size(), - dependTypeOperand); - for (const Fortran::parser::Designator &ompObject : depVal) { - Fortran::semantics::Symbol *sym = nullptr; - std::visit( - Fortran::common::visitors{ - [&](const Fortran::parser::DataRef &designator) { - if (const Fortran::parser::Name *name = - std::get_if(&designator.u)) { - sym = name->symbol; - } else if (std::get_if>( - &designator.u)) { - TODO(converter.getCurrentLocation(), - "array sections not supported for task depend"); - } - }, - [&](const Fortran::parser::Substring &designator) { - TODO(converter.getCurrentLocation(), - "substring not supported for task depend"); - }}, - (ompObject).u); - const mlir::Value variable = converter.getSymbolAddress(*sym); - dependOperands.push_back(variable); - } - }); -} - -bool ClauseProcessor::processIf( - Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName, - mlir::Value &result) const { - bool found = false; - findRepeatableClause( - [&](const ClauseTy::If *ifClause, - const Fortran::parser::CharBlock &source) { - mlir::Location clauseLocation = converter.genLocation(source); - mlir::Value operand = getIfClauseOperand(converter, ifClause, - directiveName, clauseLocation); - // Assume that, at most, a single 'if' clause will be applicable to the - // given directive. 
- if (operand) { - result = operand; - found = true; - } - }); - return found; -} - -bool ClauseProcessor::processLink( - llvm::SmallVectorImpl &result) const { - return findRepeatableClause( - [&](const ClauseTy::Link *linkClause, - const Fortran::parser::CharBlock &) { - // Case: declare target link(var1, var2)... - gatherFuncAndVarSyms( - linkClause->v, mlir::omp::DeclareTargetCaptureClause::link, result); - }); -} - -static mlir::omp::MapInfoOp -createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, - mlir::SmallVector bounds, - mlir::SmallVector members, uint64_t mapType, - mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, - bool isVal = false) { - if (auto boxTy = baseAddr.getType().dyn_cast()) { - baseAddr = builder.create(loc, baseAddr); - retTy = baseAddr.getType(); - } - - mlir::TypeAttr varType = mlir::TypeAttr::get( - llvm::cast(retTy).getElementType()); - - mlir::omp::MapInfoOp op = builder.create( - loc, retTy, baseAddr, varType, varPtrPtr, members, bounds, - builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), - builder.getAttr(mapCaptureType), - builder.getStringAttr(name)); - - return op; -} - -bool ClauseProcessor::processMap( - mlir::Location currentLocation, const llvm::omp::Directive &directive, - Fortran::lower::StatementContext &stmtCtx, - llvm::SmallVectorImpl &mapOperands, - llvm::SmallVectorImpl *mapSymTypes, - llvm::SmallVectorImpl *mapSymLocs, - llvm::SmallVectorImpl *mapSymbols) - const { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - return findRepeatableClause( - [&](const ClauseTy::Map *mapClause, - const Fortran::parser::CharBlock &source) { - mlir::Location clauseLocation = converter.genLocation(source); - const auto &oMapType = - std::get>( - mapClause->v.t); - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; - // If the map type is specified, then 
process it else Tofrom is the - // default. - if (oMapType) { - const Fortran::parser::OmpMapType::Type &mapType = - std::get(oMapType->t); - switch (mapType) { - case Fortran::parser::OmpMapType::Type::To: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - break; - case Fortran::parser::OmpMapType::Type::From: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - break; - case Fortran::parser::OmpMapType::Type::Tofrom: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - break; - case Fortran::parser::OmpMapType::Type::Alloc: - case Fortran::parser::OmpMapType::Type::Release: - // alloc and release is the default map_type for the Target Data - // Ops, i.e. if no bits for map_type is supplied then alloc/release - // is implicitly assumed based on the target directive. Default - // value for Target Data and Enter Data is alloc and for Exit Data - // it is release. - break; - case Fortran::parser::OmpMapType::Type::Delete: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; - } - - if (std::get>( - oMapType->t)) - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; - } else { - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - } - - for (const Fortran::parser::OmpObject &ompObject : - std::get(mapClause->v.t).v) { - llvm::SmallVector bounds; - std::stringstream asFortran; - - Fortran::lower::AddrAndBoundsInfo info = - Fortran::lower::gatherDataOperandAddrAndBounds< - Fortran::parser::OmpObject, mlir::omp::DataBoundsOp, - mlir::omp::DataBoundsType>( - converter, firOpBuilder, semaCtx, stmtCtx, ompObject, - clauseLocation, asFortran, bounds, treatIndexAsSection); - - auto origSymbol = - converter.getSymbolAddress(*getOmpObjectSymbol(ompObject)); - mlir::Value symAddr = info.addr; - if (origSymbol && fir::isTypeWithDescriptor(origSymbol.getType())) - symAddr = 
origSymbol; - - // Explicit map captures are captured ByRef by default, - // optimisation passes may alter this to ByCopy or other capture - // types to optimise - mlir::Value mapOp = createMapInfoOp( - firOpBuilder, clauseLocation, symAddr, mlir::Value{}, - asFortran.str(), bounds, {}, - static_cast< - std::underlying_type_t>( - mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, symAddr.getType()); - - mapOperands.push_back(mapOp); - if (mapSymTypes) - mapSymTypes->push_back(symAddr.getType()); - if (mapSymLocs) - mapSymLocs->push_back(symAddr.getLoc()); - - if (mapSymbols) - mapSymbols->push_back(getOmpObjectSymbol(ompObject)); - } - }); -} - -bool ClauseProcessor::processReduction( - mlir::Location currentLocation, - llvm::SmallVectorImpl &reductionVars, - llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl *reductionSymbols) - const { - return findRepeatableClause( - [&](const ClauseTy::Reduction *reductionClause, - const Fortran::parser::CharBlock &) { - ReductionProcessor rp; - rp.addReductionDecl(currentLocation, converter, reductionClause->v, - reductionVars, reductionDeclSymbols, - reductionSymbols); - }); -} - -bool ClauseProcessor::processSectionsReduction( - mlir::Location currentLocation) const { - return findRepeatableClause( - [&](const ClauseTy::Reduction *, const Fortran::parser::CharBlock &) { - TODO(currentLocation, "OMPC_Reduction"); - }); -} - -bool ClauseProcessor::processTo( - llvm::SmallVectorImpl &result) const { - return findRepeatableClause( - [&](const ClauseTy::To *toClause, const Fortran::parser::CharBlock &) { - // Case: declare target to(func, var1, var2)... - gatherFuncAndVarSyms(toClause->v, - mlir::omp::DeclareTargetCaptureClause::to, result); - }); -} - -bool ClauseProcessor::processEnter( - llvm::SmallVectorImpl &result) const { - return findRepeatableClause( - [&](const ClauseTy::Enter *enterClause, - const Fortran::parser::CharBlock &) { - // Case: declare target enter(func, var1, var2)... 
- gatherFuncAndVarSyms(enterClause->v, - mlir::omp::DeclareTargetCaptureClause::enter, - result); - }); -} - -bool ClauseProcessor::processUseDeviceAddr( - llvm::SmallVectorImpl &operands, - llvm::SmallVectorImpl &useDeviceTypes, - llvm::SmallVectorImpl &useDeviceLocs, - llvm::SmallVectorImpl &useDeviceSymbols) - const { - return findRepeatableClause( - [&](const ClauseTy::UseDeviceAddr *devAddrClause, - const Fortran::parser::CharBlock &) { - addUseDeviceClause(converter, devAddrClause->v, operands, - useDeviceTypes, useDeviceLocs, useDeviceSymbols); - }); -} - -bool ClauseProcessor::processUseDevicePtr( - llvm::SmallVectorImpl &operands, - llvm::SmallVectorImpl &useDeviceTypes, - llvm::SmallVectorImpl &useDeviceLocs, - llvm::SmallVectorImpl &useDeviceSymbols) - const { - return findRepeatableClause( - [&](const ClauseTy::UseDevicePtr *devPtrClause, - const Fortran::parser::CharBlock &) { - addUseDeviceClause(converter, devPtrClause->v, operands, useDeviceTypes, - useDeviceLocs, useDeviceSymbols); - }); -} - -template -bool ClauseProcessor::processMotionClauses( - Fortran::lower::StatementContext &stmtCtx, - llvm::SmallVectorImpl &mapOperands) { - return findRepeatableClause( - [&](const T *motionClause, const Fortran::parser::CharBlock &source) { - mlir::Location clauseLocation = converter.genLocation(source); - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - - static_assert(std::is_same_v || - std::is_same_v); - - // TODO Support motion modifiers: present, mapper, iterator. - constexpr llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - std::is_same_v - ? 
llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO - : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - - for (const Fortran::parser::OmpObject &ompObject : motionClause->v.v) { - llvm::SmallVector bounds; - std::stringstream asFortran; - Fortran::lower::AddrAndBoundsInfo info = - Fortran::lower::gatherDataOperandAddrAndBounds< - Fortran::parser::OmpObject, mlir::omp::DataBoundsOp, - mlir::omp::DataBoundsType>( - converter, firOpBuilder, semaCtx, stmtCtx, ompObject, - clauseLocation, asFortran, bounds, treatIndexAsSection); - - auto origSymbol = - converter.getSymbolAddress(*getOmpObjectSymbol(ompObject)); - mlir::Value symAddr = info.addr; - if (origSymbol && fir::isTypeWithDescriptor(origSymbol.getType())) - symAddr = origSymbol; - - // Explicit map captures are captured ByRef by default, - // optimisation passes may alter this to ByCopy or other capture - // types to optimise - mlir::Value mapOp = createMapInfoOp( - firOpBuilder, clauseLocation, symAddr, mlir::Value{}, - asFortran.str(), bounds, {}, - static_cast< - std::underlying_type_t>( - mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, symAddr.getType()); - - mapOperands.push_back(mapOp); - } - }); -} - -template -void ClauseProcessor::processTODO(mlir::Location currentLocation, - llvm::omp::Directive directive) const { - auto checkUnhandledClause = [&](const auto *x) { - if (!x) - return; - TODO(currentLocation, - "Unhandled clause " + - llvm::StringRef(Fortran::parser::ParseTreeDumper::GetNodeName(*x)) - .upper() + - " in " + llvm::omp::getOpenMPDirectiveName(directive).upper() + - " construct"); - }; - - for (ClauseIterator it = clauses.v.begin(); it != clauses.v.end(); ++it) - (checkUnhandledClause(std::get_if(&it->u)), ...); -} - -//===----------------------------------------------------------------------===// -// Code generation helper functions -//===----------------------------------------------------------------------===// - static fir::GlobalOp globalInitialization( 
Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &firOpBuilder, const Fortran::semantics::Symbol &sym, diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp new file mode 100644 index 0000000000000..a8b98f3f56724 --- /dev/null +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -0,0 +1,431 @@ +//===-- ReductionProcessor.cpp ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#include "ReductionProcessor.h" + +#include "flang/Lower/AbstractConverter.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Parser/tools.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" + +namespace Fortran { +namespace lower { +namespace omp { + +ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType( + const Fortran::parser::ProcedureDesignator &pd) { + auto redType = llvm::StringSwitch>( + ReductionProcessor::getRealName(pd).ToString()) + .Case("max", ReductionIdentifier::MAX) + .Case("min", ReductionIdentifier::MIN) + .Case("iand", ReductionIdentifier::IAND) + .Case("ior", ReductionIdentifier::IOR) + .Case("ieor", ReductionIdentifier::IEOR) + .Default(std::nullopt); + assert(redType && "Invalid Reduction"); + return *redType; +} + +ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType( + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp) { + switch (intrinsicOp) { + case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + return 
ReductionIdentifier::ADD; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Subtract: + return ReductionIdentifier::SUBTRACT; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + return ReductionIdentifier::MULTIPLY; + case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: + return ReductionIdentifier::AND; + case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + return ReductionIdentifier::EQV; + case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: + return ReductionIdentifier::OR; + case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + return ReductionIdentifier::NEQV; + default: + llvm_unreachable("unexpected intrinsic operator in reduction"); + } +} + +bool ReductionProcessor::supportedIntrinsicProcReduction( + const Fortran::parser::ProcedureDesignator &pd) { + const auto *name{Fortran::parser::Unwrap(pd)}; + assert(name && "Invalid Reduction Intrinsic."); + if (!name->symbol->GetUltimate().attrs().test( + Fortran::semantics::Attr::INTRINSIC)) + return false; + auto redType = llvm::StringSwitch(getRealName(name).ToString()) + .Case("max", true) + .Case("min", true) + .Case("iand", true) + .Case("ior", true) + .Case("ieor", true) + .Default(false); + return redType; +} + +std::string ReductionProcessor::getReductionName(llvm::StringRef name, + mlir::Type ty) { + return (llvm::Twine(name) + + (ty.isIntOrIndex() ? 
llvm::Twine("_i_") : llvm::Twine("_f_")) + + llvm::Twine(ty.getIntOrFloatBitWidth())) + .str(); +} + +std::string ReductionProcessor::getReductionName( + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, + mlir::Type ty) { + std::string reductionName; + + switch (intrinsicOp) { + case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + reductionName = "add_reduction"; + break; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + reductionName = "multiply_reduction"; + break; + case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: + return "and_reduction"; + case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + return "eqv_reduction"; + case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: + return "or_reduction"; + case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + return "neqv_reduction"; + default: + reductionName = "other_reduction"; + break; + } + + return getReductionName(reductionName, ty); +} + +mlir::Value +ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type, + ReductionIdentifier redId, + fir::FirOpBuilder &builder) { + assert((fir::isa_integer(type) || fir::isa_real(type) || + type.isa()) && + "only integer, logical and real types are currently supported"); + switch (redId) { + case ReductionIdentifier::MAX: { + if (auto ty = type.dyn_cast()) { + const llvm::fltSemantics &sem = ty.getFloatSemantics(); + return builder.createRealConstant( + loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/true)); + } + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t minInt = llvm::APInt::getSignedMinValue(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, minInt); + } + case ReductionIdentifier::MIN: { + if (auto ty = type.dyn_cast()) { + const llvm::fltSemantics &sem = ty.getFloatSemantics(); + return builder.createRealConstant( + loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/false)); + } + unsigned bits = 
type.getIntOrFloatBitWidth(); + int64_t maxInt = llvm::APInt::getSignedMaxValue(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, maxInt); + } + case ReductionIdentifier::IOR: { + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, zeroInt); + } + case ReductionIdentifier::IEOR: { + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, zeroInt); + } + case ReductionIdentifier::IAND: { + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t allOnInt = llvm::APInt::getAllOnes(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, allOnInt); + } + case ReductionIdentifier::ADD: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::OR: + case ReductionIdentifier::EQV: + case ReductionIdentifier::NEQV: + if (type.isa()) + return builder.create( + loc, type, + builder.getFloatAttr(type, (double)getOperationIdentity(redId, loc))); + + if (type.isa()) { + mlir::Value intConst = builder.create( + loc, builder.getI1Type(), + builder.getIntegerAttr(builder.getI1Type(), + getOperationIdentity(redId, loc))); + return builder.createConvert(loc, type, intConst); + } + + return builder.create( + loc, type, + builder.getIntegerAttr(type, getOperationIdentity(redId, loc))); + case ReductionIdentifier::ID: + case ReductionIdentifier::USER_DEF_OP: + case ReductionIdentifier::SUBTRACT: + TODO(loc, "Reduction of some identifier types is not supported"); + } + llvm_unreachable("Unhandled Reduction identifier : getReductionInitValue"); +} + +mlir::Value ReductionProcessor::createScalarCombiner( + fir::FirOpBuilder &builder, mlir::Location loc, ReductionIdentifier redId, + mlir::Type type, mlir::Value op1, mlir::Value op2) { + mlir::Value reductionOp; + switch (redId) { + case 
ReductionIdentifier::MAX: + reductionOp = + getReductionOperation( + builder, type, loc, op1, op2); + break; + case ReductionIdentifier::MIN: + reductionOp = + getReductionOperation( + builder, type, loc, op1, op2); + break; + case ReductionIdentifier::IOR: + assert((type.isIntOrIndex()) && "only integer is expected"); + reductionOp = builder.create(loc, op1, op2); + break; + case ReductionIdentifier::IEOR: + assert((type.isIntOrIndex()) && "only integer is expected"); + reductionOp = builder.create(loc, op1, op2); + break; + case ReductionIdentifier::IAND: + assert((type.isIntOrIndex()) && "only integer is expected"); + reductionOp = builder.create(loc, op1, op2); + break; + case ReductionIdentifier::ADD: + reductionOp = + getReductionOperation( + builder, type, loc, op1, op2); + break; + case ReductionIdentifier::MULTIPLY: + reductionOp = + getReductionOperation( + builder, type, loc, op1, op2); + break; + case ReductionIdentifier::AND: { + mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); + mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + + mlir::Value andiOp = builder.create(loc, op1I1, op2I1); + + reductionOp = builder.createConvert(loc, type, andiOp); + break; + } + case ReductionIdentifier::OR: { + mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); + mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + + mlir::Value oriOp = builder.create(loc, op1I1, op2I1); + + reductionOp = builder.createConvert(loc, type, oriOp); + break; + } + case ReductionIdentifier::EQV: { + mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); + mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + + mlir::Value cmpiOp = builder.create( + loc, mlir::arith::CmpIPredicate::eq, op1I1, op2I1); + + reductionOp = builder.createConvert(loc, type, cmpiOp); + break; + } + case ReductionIdentifier::NEQV: { + mlir::Value op1I1 = builder.createConvert(loc, 
builder.getI1Type(), op1); + mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + + mlir::Value cmpiOp = builder.create( + loc, mlir::arith::CmpIPredicate::ne, op1I1, op2I1); + + reductionOp = builder.createConvert(loc, type, cmpiOp); + break; + } + default: + TODO(loc, "Reduction of some intrinsic operators is not supported"); + } + + return reductionOp; +} + +mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl( + fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, + const ReductionIdentifier redId, mlir::Type type, mlir::Location loc) { + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::ModuleOp module = builder.getModule(); + + auto decl = + module.lookupSymbol(reductionOpName); + if (decl) + return decl; + + mlir::OpBuilder modBuilder(module.getBodyRegion()); + + decl = modBuilder.create(loc, reductionOpName, + type); + builder.createBlock(&decl.getInitializerRegion(), + decl.getInitializerRegion().end(), {type}, {loc}); + builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); + mlir::Value init = getReductionInitValue(loc, type, redId, builder); + builder.create(loc, init); + + builder.createBlock(&decl.getReductionRegion(), + decl.getReductionRegion().end(), {type, type}, + {loc, loc}); + + builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); + mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); + mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); + + mlir::Value reductionOp = + createScalarCombiner(builder, loc, redId, type, op1, op2); + builder.create(loc, reductionOp); + + return decl; +} + +void ReductionProcessor::addReductionDecl( + mlir::Location currentLocation, + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::OmpReductionClause &reduction, + llvm::SmallVectorImpl &reductionVars, + llvm::SmallVectorImpl &reductionDeclSymbols, + llvm::SmallVectorImpl + *reductionSymbols) { + fir::FirOpBuilder &firOpBuilder = 
converter.getFirOpBuilder(); + mlir::omp::ReductionDeclareOp decl; + const auto &redOperator{ + std::get(reduction.t)}; + const auto &objectList{std::get(reduction.t)}; + if (const auto &redDefinedOp = + std::get_if(&redOperator.u)) { + const auto &intrinsicOp{ + std::get( + redDefinedOp->u)}; + ReductionIdentifier redId = getReductionType(intrinsicOp); + switch (redId) { + case ReductionIdentifier::ADD: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::EQV: + case ReductionIdentifier::OR: + case ReductionIdentifier::NEQV: + break; + default: + TODO(currentLocation, + "Reduction of some intrinsic operators is not supported"); + break; + } + for (const Fortran::parser::OmpObject &ompObject : objectList.v) { + if (const auto *name{ + Fortran::parser::Unwrap(ompObject)}) { + if (const Fortran::semantics::Symbol * symbol{name->symbol}) { + if (reductionSymbols) + reductionSymbols->push_back(symbol); + mlir::Value symVal = converter.getSymbolAddress(*symbol); + if (auto declOp = symVal.getDefiningOp()) + symVal = declOp.getBase(); + mlir::Type redType = + symVal.getType().cast().getEleTy(); + reductionVars.push_back(symVal); + if (redType.isa()) + decl = createReductionDecl( + firOpBuilder, + getReductionName(intrinsicOp, firOpBuilder.getI1Type()), redId, + redType, currentLocation); + else if (redType.isIntOrIndexOrFloat()) { + decl = createReductionDecl(firOpBuilder, + getReductionName(intrinsicOp, redType), + redId, redType, currentLocation); + } else { + TODO(currentLocation, "Reduction of some types is not supported"); + } + reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( + firOpBuilder.getContext(), decl.getSymName())); + } + } + } + } else if (const auto *reductionIntrinsic = + std::get_if( + &redOperator.u)) { + if (ReductionProcessor::supportedIntrinsicProcReduction( + *reductionIntrinsic)) { + ReductionProcessor::ReductionIdentifier redId = + ReductionProcessor::getReductionType(*reductionIntrinsic); + 
for (const Fortran::parser::OmpObject &ompObject : objectList.v) { + if (const auto *name{ + Fortran::parser::Unwrap(ompObject)}) { + if (const Fortran::semantics::Symbol * symbol{name->symbol}) { + if (reductionSymbols) + reductionSymbols->push_back(symbol); + mlir::Value symVal = converter.getSymbolAddress(*symbol); + if (auto declOp = symVal.getDefiningOp()) + symVal = declOp.getBase(); + mlir::Type redType = + symVal.getType().cast().getEleTy(); + reductionVars.push_back(symVal); + assert(redType.isIntOrIndexOrFloat() && + "Unsupported reduction type"); + decl = createReductionDecl( + firOpBuilder, + getReductionName(getRealName(*reductionIntrinsic).ToString(), + redType), + redId, redType, currentLocation); + reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( + firOpBuilder.getContext(), decl.getSymName())); + } + } + } + } + } +} + +const Fortran::semantics::SourceName +ReductionProcessor::getRealName(const Fortran::parser::Name *name) { + return name->symbol->GetUltimate().name(); +} + +const Fortran::semantics::SourceName ReductionProcessor::getRealName( + const Fortran::parser::ProcedureDesignator &pd) { + const auto *name{Fortran::parser::Unwrap(pd)}; + assert(name && "Invalid Reduction Intrinsic."); + return getRealName(name); +} + +int ReductionProcessor::getOperationIdentity(ReductionIdentifier redId, + mlir::Location loc) { + switch (redId) { + case ReductionIdentifier::ADD: + case ReductionIdentifier::OR: + case ReductionIdentifier::NEQV: + return 0; + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::EQV: + return 1; + default: + TODO(loc, "Reduction of some intrinsic operators is not supported"); + } +} + +} // namespace omp +} // namespace lower +} // namespace Fortran diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h new file mode 100644 index 0000000000000..00770fe81d1ef --- /dev/null +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h @@ -0,0 
+1,138 @@ +//===-- Lower/OpenMP/ReductionProcessor.h -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_REDUCTIONPROCESSOR_H +#define FORTRAN_LOWER_REDUCTIONPROCESSOR_H + +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Parser/parse-tree.h" +#include "flang/Semantics/symbol.h" +#include "flang/Semantics/type.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Types.h" + +namespace mlir { +namespace omp { +class ReductionDeclareOp; +} // namespace omp +} // namespace mlir + +namespace Fortran { +namespace lower { +class AbstractConverter; +} // namespace lower +} // namespace Fortran + +namespace Fortran { +namespace lower { +namespace omp { + +class ReductionProcessor { +public: + // TODO: Move this enumeration to the OpenMP dialect + enum ReductionIdentifier { + ID, + USER_DEF_OP, + ADD, + SUBTRACT, + MULTIPLY, + AND, + OR, + EQV, + NEQV, + MAX, + MIN, + IAND, + IOR, + IEOR + }; + + static ReductionIdentifier + getReductionType(const Fortran::parser::ProcedureDesignator &pd); + + static ReductionIdentifier getReductionType( + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp); + + static bool supportedIntrinsicProcReduction( + const Fortran::parser::ProcedureDesignator &pd); + + static const Fortran::semantics::SourceName + getRealName(const Fortran::parser::Name *name); + + static const Fortran::semantics::SourceName + getRealName(const Fortran::parser::ProcedureDesignator &pd); + + static std::string getReductionName(llvm::StringRef name, mlir::Type ty); + + static 
std::string getReductionName( + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, + mlir::Type ty); + + /// This function returns the identity value of the operator \p + /// reductionOpName. For example: + /// 0 + x = x, + /// 1 * x = x + static int getOperationIdentity(ReductionIdentifier redId, + mlir::Location loc); + + static mlir::Value getReductionInitValue(mlir::Location loc, mlir::Type type, + ReductionIdentifier redId, + fir::FirOpBuilder &builder); + + template + static mlir::Value getReductionOperation(fir::FirOpBuilder &builder, + mlir::Type type, mlir::Location loc, + mlir::Value op1, mlir::Value op2); + + static mlir::Value createScalarCombiner(fir::FirOpBuilder &builder, + mlir::Location loc, + ReductionIdentifier redId, + mlir::Type type, mlir::Value op1, + mlir::Value op2); + + /// Creates an OpenMP reduction declaration and inserts it into the provided + /// symbol table. The declaration has a constant initializer with the neutral + /// value `initValue`, and the reduction combiner carried over from `reduce`. + /// TODO: Generalize this for non-integer types, add atomic region. + static mlir::omp::ReductionDeclareOp createReductionDecl( + fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, + const ReductionIdentifier redId, mlir::Type type, mlir::Location loc); + + /// Creates a reduction declaration and associates it with an OpenMP block + /// directive. 
+ static void + addReductionDecl(mlir::Location currentLocation, + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::OmpReductionClause &reduction, + llvm::SmallVectorImpl &reductionVars, + llvm::SmallVectorImpl &reductionDeclSymbols, + llvm::SmallVectorImpl + *reductionSymbols = nullptr); +}; + +template +mlir::Value +ReductionProcessor::getReductionOperation(fir::FirOpBuilder &builder, + mlir::Type type, mlir::Location loc, + mlir::Value op1, mlir::Value op2) { + assert(type.isIntOrIndexOrFloat() && + "only integer and float types are currently supported"); + if (type.isIntOrIndex()) + return builder.create(loc, op1, op2); + return builder.create(loc, op1, op2); +} + +} // namespace omp +} // namespace lower +} // namespace Fortran + +#endif // FORTRAN_LOWER_REDUCTIONPROCESSOR_H diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp new file mode 100644 index 0000000000000..31b15257d1868 --- /dev/null +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -0,0 +1,99 @@ +//===-- Utils..cpp ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#include "Utils.h" + +#include +#include +#include +#include +#include +#include + +llvm::cl::opt treatIndexAsSection( + "openmp-treat-index-as-section", + llvm::cl::desc("In the OpenMP data clauses treat `a(N)` as `a(N:N)`."), + llvm::cl::init(true)); + +namespace Fortran { +namespace lower { +namespace omp { + +void genObjectList(const Fortran::parser::OmpObjectList &objectList, + Fortran::lower::AbstractConverter &converter, + llvm::SmallVectorImpl &operands) { + auto addOperands = [&](Fortran::lower::SymbolRef sym) { + const mlir::Value variable = converter.getSymbolAddress(sym); + if (variable) { + operands.push_back(variable); + } else { + if (const auto *details = + sym->detailsIf()) { + operands.push_back(converter.getSymbolAddress(details->symbol())); + converter.copySymbolBinding(details->symbol(), sym); + } + } + }; + for (const Fortran::parser::OmpObject &ompObject : objectList.v) { + Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); + addOperands(*sym); + } +} + +void gatherFuncAndVarSyms( + const Fortran::parser::OmpObjectList &objList, + mlir::omp::DeclareTargetCaptureClause clause, + llvm::SmallVectorImpl &symbolAndClause) { + for (const Fortran::parser::OmpObject &ompObject : objList.v) { + Fortran::common::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::Designator &designator) { + if (const Fortran::parser::Name *name = + Fortran::semantics::getDesignatorNameIfDataRef( + designator)) { + symbolAndClause.emplace_back(clause, *name->symbol); + } + }, + [&](const Fortran::parser::Name &name) { + symbolAndClause.emplace_back(clause, *name.symbol); + }}, + ompObject.u); + } +} + +Fortran::semantics::Symbol * 
+getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject) { + Fortran::semantics::Symbol *sym = nullptr; + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::Designator &designator) { + if (auto *arrayEle = + Fortran::parser::Unwrap( + designator)) { + sym = GetFirstName(arrayEle->base).symbol; + } else if (auto *structComp = Fortran::parser::Unwrap< + Fortran::parser::StructureComponent>(designator)) { + sym = structComp->component.symbol; + } else if (const Fortran::parser::Name *name = + Fortran::semantics::getDesignatorNameIfDataRef( + designator)) { + sym = name->symbol; + } + }, + [&](const Fortran::parser::Name &name) { sym = name.symbol; }}, + ompObject.u); + return sym; +} + +} // namespace omp +} // namespace lower +} // namespace Fortran diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h new file mode 100644 index 0000000000000..c346f891f0797 --- /dev/null +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -0,0 +1,68 @@ +//===-- Lower/OpenMP/Utils.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_OPENMPUTILS_H +#define FORTRAN_LOWER_OPENMPUTILS_H + +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Value.h" +#include "llvm/Support/CommandLine.h" + +extern llvm::cl::opt treatIndexAsSection; + +namespace fir { +class FirOpBuilder; +} // namespace fir + +namespace Fortran { + +namespace semantics { +class Symbol; +} // namespace semantics + +namespace parser { +struct OmpObject; +struct OmpObjectList; +} // namespace parser + +namespace lower { + +class AbstractConverter; + +namespace omp { + +using DeclareTargetCapturePair = + std::pair; + +mlir::omp::MapInfoOp +createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, + mlir::SmallVector bounds, + mlir::SmallVector members, uint64_t mapType, + mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, + bool isVal = false); + +void gatherFuncAndVarSyms( + const Fortran::parser::OmpObjectList &objList, + mlir::omp::DeclareTargetCaptureClause clause, + llvm::SmallVectorImpl &symbolAndClause); + +Fortran::semantics::Symbol * +getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject); + +void genObjectList(const Fortran::parser::OmpObjectList &objectList, + Fortran::lower::AbstractConverter &converter, + llvm::SmallVectorImpl &operands); + +} // namespace omp +} // namespace lower +} // namespace Fortran + +#endif // FORTRAN_LOWER_OPENMPUTILS_H From 5a023f564f9886bcc732147d12e114f5ced92c5d Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov Date: Wed, 21 Feb 2024 14:59:47 +0000 Subject: [PATCH 105/351] [AArch64][SVE2] Enable dynamic shuffle for fixed length types. 
(#72490) When SVE register size is unknown or the minimal size is not equal to the maximum size then we could determine the actual SVE register size in the runtime and adjust shuffle mask in the runtime. --- .../Target/AArch64/AArch64ISelLowering.cpp | 60 ++- .../sve-fixed-length-vector-shuffle-tbl.ll | 407 ++++++++++++++++-- 2 files changed, 432 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8c5a4cdae1163..184ebc19bc9ed 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26798,7 +26798,7 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, // Ignore two operands if no SVE2 or all index numbers couldn't // be represented. - if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize)) + if (!IsSingleOp && !Subtarget.hasSVE2()) return SDValue(); EVT VTOp1 = Op.getOperand(0).getValueType(); @@ -26806,18 +26806,39 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, unsigned IndexLen = MinSVESize / BitsPerElt; unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements(); uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue(); + EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger(); + EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen); + bool MinMaxEqual = (MinSVESize == MaxSVESize); assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen && "Incorrectly legalised shuffle operation"); SmallVector TBLMask; + // If MinSVESize is not equal to MaxSVESize then we need to know which + // TBL mask element needs adjustment. + SmallVector AddRuntimeVLMask; + + // Bail out for 8-bits element types, because with 2048-bit SVE register + // size 8 bits is only sufficient to index into the first source vector. 
+ if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8) + return SDValue(); + for (int Index : ShuffleMask) { // Handling poison index value. if (Index < 0) Index = 0; - // If we refer to the second operand then we have to add elements - // number in hardware register minus number of elements in a type. - if ((unsigned)Index >= ElementsPerVectorReg) - Index += IndexLen - ElementsPerVectorReg; + // If the mask refers to elements in the second operand, then we have to + // offset the index by the number of elements in a vector. If this is number + // is not known at compile-time, we need to maintain a mask with 'VL' values + // to add at runtime. + if ((unsigned)Index >= ElementsPerVectorReg) { + if (MinMaxEqual) { + Index += IndexLen - ElementsPerVectorReg; + } else { + Index = Index - ElementsPerVectorReg; + AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64)); + } + } else if (!MinMaxEqual) + AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64)); // For 8-bit elements and 1024-bit SVE registers and MaxOffset equals // to 255, this might point to the last element of in the second operand // of the shufflevector, thus we are rejecting this transform. @@ -26830,11 +26851,12 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, // value where it would perform first lane duplication for out of // index elements. For i8 elements an out-of-range index could be a valid // for 2048-bit vector register size. 
- for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) + for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) { TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64)); + if (!MinMaxEqual) + AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64)); + } - EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt); - EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen); EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType); SDValue VecMask = DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen)); @@ -26846,13 +26868,29 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), Op1, SVEMask); - else if (Subtarget.hasSVE2()) + else if (Subtarget.hasSVE2()) { + if (!MinMaxEqual) { + unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt; + SDValue VScale = (BitsPerElt == 64) + ? 
DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts)) + : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts)); + SDValue VecMask = + DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen)); + SDValue MulByMask = DAG.getNode( + ISD::MUL, DL, MaskType, + DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale), + DAG.getBuildVector(MaskType, DL, + ArrayRef(AddRuntimeVLMask.data(), IndexLen))); + SDValue UpdatedVecMask = + DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask); + SVEMask = convertToScalableVector( + DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask); + } Shuffle = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1, Op2, SVEMask); - else - llvm_unreachable("Cannot lower shuffle without SVE2 TBL"); + } Shuffle = convertFromScalableVector(DAG, VT, Shuffle); return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll index bae3c6582c6b0..68c234a20d110 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128_NOMAX +; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefixes=CHECK,SVE2_NOMIN_NOMAX +; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_MIN_256_NOMAX target triple = "aarch64-unknown-linux-gnu" @@ 
-16,14 +18,43 @@ target triple = "aarch64-unknown-linux-gnu" ; SVE2_128-NEXT: .byte 255 // 0xff ; SVE2_128-NEXT: .byte 255 // 0xff define <8 x i8> @shuffle_index_indices_from_op1(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_index_indices_from_op1: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: ret +; SVE2_128-LABEL: shuffle_index_indices_from_op1: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI0_0 +; SVE2_128-NEXT: ldr d0, [x0] +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; SVE2_128-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_op1: +; SVE2_128_NOMAX: // %bb.0: +; SVE2_128_NOMAX-NEXT: adrp x8, .LCPI0_0 +; SVE2_128_NOMAX-NEXT: ldr d0, [x0] +; SVE2_128_NOMAX-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; SVE2_128_NOMAX-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128_NOMAX-NEXT: ret +; +; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_op1: +; SVE2_NOMIN_NOMAX: // %bb.0: +; SVE2_NOMIN_NOMAX-NEXT: adrp x8, .LCPI0_0 +; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x0] +; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; SVE2_NOMIN_NOMAX-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_NOMIN_NOMAX-NEXT: ret +; +; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_op1: +; SVE2_MIN_256_NOMAX: // %bb.0: +; SVE2_MIN_256_NOMAX-NEXT: ptrue p0.b, vl32 +; SVE2_MIN_256_NOMAX-NEXT: adrp x8, .LCPI0_0 +; SVE2_MIN_256_NOMAX-NEXT: add x8, x8, :lo12:.LCPI0_0 +; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0] +; SVE2_MIN_256_NOMAX-NEXT: ld1b { z0.b }, p0/z, [x8] +; SVE2_MIN_256_NOMAX-NEXT: tbl z0.b, { z1.b }, z0.b +; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; 
SVE2_MIN_256_NOMAX-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> @@ -42,14 +73,43 @@ define <8 x i8> @shuffle_index_indices_from_op1(ptr %a, ptr %b) { ; SVE2_128-NEXT: .byte 255 // 0xff ; SVE2_128-NEXT: .byte 255 // 0xff define <8 x i8> @shuffle_index_indices_from_op2(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_index_indices_from_op2: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: ret +; SVE2_128-LABEL: shuffle_index_indices_from_op2: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI1_0 +; SVE2_128-NEXT: ldr d0, [x1] +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; SVE2_128-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_op2: +; SVE2_128_NOMAX: // %bb.0: +; SVE2_128_NOMAX-NEXT: adrp x8, .LCPI1_0 +; SVE2_128_NOMAX-NEXT: ldr d0, [x1] +; SVE2_128_NOMAX-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; SVE2_128_NOMAX-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128_NOMAX-NEXT: ret +; +; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_op2: +; SVE2_NOMIN_NOMAX: // %bb.0: +; SVE2_NOMIN_NOMAX-NEXT: adrp x8, .LCPI1_0 +; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1] +; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; SVE2_NOMIN_NOMAX-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_NOMIN_NOMAX-NEXT: ret +; +; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_op2: +; SVE2_MIN_256_NOMAX: // %bb.0: +; SVE2_MIN_256_NOMAX-NEXT: ptrue p0.b, vl32 +; SVE2_MIN_256_NOMAX-NEXT: adrp x8, .LCPI1_0 +; SVE2_MIN_256_NOMAX-NEXT: add x8, x8, :lo12:.LCPI1_0 +; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x1] +; 
SVE2_MIN_256_NOMAX-NEXT: ld1b { z0.b }, p0/z, [x8] +; SVE2_MIN_256_NOMAX-NEXT: tbl z0.b, { z1.b }, z0.b +; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_MIN_256_NOMAX-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> @@ -109,6 +169,70 @@ define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) { ; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8] ; SVE2_128_NOMAX-NEXT: add sp, sp, #16 ; SVE2_128_NOMAX-NEXT: ret +; +; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops: +; SVE2_NOMIN_NOMAX: // %bb.0: +; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16 +; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16 +; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1] +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[7] +; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[6] +; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[4] +; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 +; SVE2_NOMIN_NOMAX-NEXT: ldr d1, [x0] +; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 +; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[3] +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #15] +; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3 +; SVE2_NOMIN_NOMAX-NEXT: mov z3.b, z0.b[2] +; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #14] +; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1] +; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 +; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #13] +; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #12] +; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s3 +; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #11] +; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s0 +; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #10] +; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 +; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #9] +; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #8] +; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8] +; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16 +; SVE2_NOMIN_NOMAX-NEXT: ret +; +; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops: +; SVE2_MIN_256_NOMAX: // %bb.0: +; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16 +; 
SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16 +; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1] +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[7] +; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[6] +; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[4] +; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 +; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x0] +; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 +; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[3] +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z1.b[1] +; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #15] +; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3 +; SVE2_MIN_256_NOMAX-NEXT: mov z3.b, z0.b[2] +; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #14] +; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1] +; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 +; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #13] +; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #12] +; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s3 +; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #11] +; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s0 +; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #10] +; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 +; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #9] +; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #8] +; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8] +; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16 +; SVE2_MIN_256_NOMAX-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> @@ -165,6 +289,64 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) { ; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8] ; SVE2_128_NOMAX-NEXT: add sp, sp, #16 ; SVE2_128_NOMAX-NEXT: ret +; +; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_poison_value: +; SVE2_NOMIN_NOMAX: // %bb.0: +; SVE2_NOMIN_NOMAX-NEXT: sub sp, sp, #16 +; SVE2_NOMIN_NOMAX-NEXT: .cfi_def_cfa_offset 16 +; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1] +; SVE2_NOMIN_NOMAX-NEXT: ldr d3, [x0] +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[6] +; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[4] +; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z0.b[3] +; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 
+; SVE2_NOMIN_NOMAX-NEXT: mov z2.b, z0.b[2] +; SVE2_NOMIN_NOMAX-NEXT: mov z0.b, z0.b[1] +; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #14] +; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s1 +; SVE2_NOMIN_NOMAX-NEXT: mov z1.b, z3.b[1] +; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #13] +; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #12] +; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s2 +; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #11] +; SVE2_NOMIN_NOMAX-NEXT: fmov w8, s0 +; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #10] +; SVE2_NOMIN_NOMAX-NEXT: fmov w9, s1 +; SVE2_NOMIN_NOMAX-NEXT: strb w8, [sp, #9] +; SVE2_NOMIN_NOMAX-NEXT: strb w9, [sp, #8] +; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [sp, #8] +; SVE2_NOMIN_NOMAX-NEXT: add sp, sp, #16 +; SVE2_NOMIN_NOMAX-NEXT: ret +; +; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_poison_value: +; SVE2_MIN_256_NOMAX: // %bb.0: +; SVE2_MIN_256_NOMAX-NEXT: sub sp, sp, #16 +; SVE2_MIN_256_NOMAX-NEXT: .cfi_def_cfa_offset 16 +; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [x1] +; SVE2_MIN_256_NOMAX-NEXT: ldr d3, [x0] +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[6] +; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[4] +; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z0.b[3] +; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 +; SVE2_MIN_256_NOMAX-NEXT: mov z2.b, z0.b[2] +; SVE2_MIN_256_NOMAX-NEXT: mov z0.b, z0.b[1] +; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #14] +; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s1 +; SVE2_MIN_256_NOMAX-NEXT: mov z1.b, z3.b[1] +; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #13] +; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #12] +; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s2 +; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #11] +; SVE2_MIN_256_NOMAX-NEXT: fmov w8, s0 +; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #10] +; SVE2_MIN_256_NOMAX-NEXT: fmov w9, s1 +; SVE2_MIN_256_NOMAX-NEXT: strb w8, [sp, #9] +; SVE2_MIN_256_NOMAX-NEXT: strb w9, [sp, #8] +; SVE2_MIN_256_NOMAX-NEXT: ldr d0, [sp, #8] +; SVE2_MIN_256_NOMAX-NEXT: add sp, sp, #16 +; SVE2_MIN_256_NOMAX-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, 
ptr %b %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> @@ -172,14 +354,43 @@ define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) { } define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) { -; CHECK-LABEL: shuffle_op1_poison: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: ret +; SVE2_128-LABEL: shuffle_op1_poison: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI4_0 +; SVE2_128-NEXT: ldr d0, [x1] +; SVE2_128-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; SVE2_128-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128-NEXT: ret +; +; SVE2_128_NOMAX-LABEL: shuffle_op1_poison: +; SVE2_128_NOMAX: // %bb.0: +; SVE2_128_NOMAX-NEXT: adrp x8, .LCPI4_0 +; SVE2_128_NOMAX-NEXT: ldr d0, [x1] +; SVE2_128_NOMAX-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; SVE2_128_NOMAX-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_128_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_128_NOMAX-NEXT: ret +; +; SVE2_NOMIN_NOMAX-LABEL: shuffle_op1_poison: +; SVE2_NOMIN_NOMAX: // %bb.0: +; SVE2_NOMIN_NOMAX-NEXT: adrp x8, .LCPI4_0 +; SVE2_NOMIN_NOMAX-NEXT: ldr d0, [x1] +; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; SVE2_NOMIN_NOMAX-NEXT: tbl z0.b, { z0.b }, z1.b +; SVE2_NOMIN_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_NOMIN_NOMAX-NEXT: ret +; +; SVE2_MIN_256_NOMAX-LABEL: shuffle_op1_poison: +; SVE2_MIN_256_NOMAX: // %bb.0: +; SVE2_MIN_256_NOMAX-NEXT: ptrue p0.b, vl32 +; SVE2_MIN_256_NOMAX-NEXT: adrp x8, .LCPI4_0 +; SVE2_MIN_256_NOMAX-NEXT: add x8, x8, :lo12:.LCPI4_0 +; SVE2_MIN_256_NOMAX-NEXT: ldr d1, [x1] +; SVE2_MIN_256_NOMAX-NEXT: ld1b { z0.b }, p0/z, [x8] +; SVE2_MIN_256_NOMAX-NEXT: tbl z0.b, { z1.b }, z0.b +; SVE2_MIN_256_NOMAX-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2_MIN_256_NOMAX-NEXT: ret %op2 = load <8 x i8>, ptr %b 
%1 = shufflevector <8 x i8> poison, <8 x i8> %op2, <8 x i32> ret <8 x i8> %1 @@ -252,3 +463,151 @@ define <8 x i8> @shuffle_index_size_op1_maxhw(ptr %a, ptr %b) "target-features"= %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> ret <8 x i8> %1 } + +; SVE2_128: .LCPI7_0: +; SVE2_128-NEXT: .hword 1 // 0x1 +; SVE2_128-NEXT: .hword 9 // 0x9 +; SVE2_128-NEXT: .hword 10 // 0xa +; SVE2_128-NEXT: .hword 11 // 0xb +; SVE2_128-NEXT: .hword 12 // 0xc +; SVE2_128-NEXT: .hword 12 // 0xc +; SVE2_128-NEXT: .hword 14 // 0xe +; SVE2_128-NEXT: .hword 15 // 0xf + +; SVE2_128_NOMAX: .LCPI7_0: +; SVE2_128_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT:.LCPI7_1: +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_128_NOMAX-NEXT: .hword 2 // 0x2 +; SVE2_128_NOMAX-NEXT: .hword 3 // 0x3 +; SVE2_128_NOMAX-NEXT: .hword 4 // 0x4 +; SVE2_128_NOMAX-NEXT: .hword 4 // 0x4 +; SVE2_128_NOMAX-NEXT: .hword 6 // 0x6 +; SVE2_128_NOMAX-NEXT: .hword 7 // 0x7 + +; SVE2_NOMIN_NOMAX: .LCPI7_0: +; SVE2_NOMIN_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT:.LCPI7_1: +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_NOMIN_NOMAX-NEXT: .hword 2 // 0x2 +; SVE2_NOMIN_NOMAX-NEXT: .hword 3 // 0x3 +; SVE2_NOMIN_NOMAX-NEXT: .hword 4 // 0x4 +; SVE2_NOMIN_NOMAX-NEXT: .hword 4 // 0x4 +; SVE2_NOMIN_NOMAX-NEXT: .hword 6 // 0x6 +; SVE2_NOMIN_NOMAX-NEXT: .hword 7 // 
0x7 + +; SVE2_MIN_256_NOMAX: .LCPI7_0: +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT: .hword 0 // 0x0 +; SVE2_MIN_256_NOMAX-NEXT:.LCPI7_1: +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 1 // 0x1 +; SVE2_MIN_256_NOMAX-NEXT: .hword 2 // 0x2 +; SVE2_MIN_256_NOMAX-NEXT: .hword 3 // 0x3 +; SVE2_MIN_256_NOMAX-NEXT: .hword 4 // 0x4 +; SVE2_MIN_256_NOMAX-NEXT: .hword 4 // 0x4 +; SVE2_MIN_256_NOMAX-NEXT: .hword 6 // 0x6 +; SVE2_MIN_256_NOMAX-NEXT: .hword 7 // 0x7 +; SVE2_MIN_256_NOMAX-NEXT: .hword 65535 // 0xffff +; SVE2_MIN_256_NOMAX-NEXT: .hword 65535 // 0xffff +; SVE2_MIN_256_NOMAX-NEXT: .hword 65535 // 0xffff +; SVE2_MIN_256_NOMAX-NEXT: .hword 65535 // 0xffff +; SVE2_MIN_256_NOMAX-NEXT: .hword 65535 // 0xffff +; SVE2_MIN_256_NOMAX-NEXT: .hword 65535 // 0xffff +; SVE2_MIN_256_NOMAX-NEXT: .hword 65535 // 0xffff +; SVE2_MIN_256_NOMAX-NEXT: .hword 65535 // 0xffff +define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { +; SVE2_128-LABEL: shuffle_index_indices_from_both_ops_i16: +; SVE2_128: // %bb.0: +; SVE2_128-NEXT: adrp x8, .LCPI7_0 +; SVE2_128-NEXT: ldr q0, [x0] +; SVE2_128-NEXT: ldr q1, [x1] +; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] +; SVE2_128-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h +; SVE2_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128-NEXT: ret +; +; 
SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: +; SVE2_128_NOMAX: // %bb.0: +; SVE2_128_NOMAX-NEXT: ptrue p0.h, vl8 +; SVE2_128_NOMAX-NEXT: cnth x8 +; SVE2_128_NOMAX-NEXT: adrp x9, .LCPI7_0 +; SVE2_128_NOMAX-NEXT: adrp x10, .LCPI7_1 +; SVE2_128_NOMAX-NEXT: mov z0.h, w8 +; SVE2_128_NOMAX-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] +; SVE2_128_NOMAX-NEXT: ldr q2, [x10, :lo12:.LCPI7_1] +; SVE2_128_NOMAX-NEXT: mad z0.h, p0/m, z1.h, z2.h +; SVE2_128_NOMAX-NEXT: ldr q1, [x0] +; SVE2_128_NOMAX-NEXT: ldr q2, [x1] +; SVE2_128_NOMAX-NEXT: tbl z0.h, { z1.h, z2.h }, z0.h +; SVE2_128_NOMAX-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_128_NOMAX-NEXT: ret +; +; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: +; SVE2_NOMIN_NOMAX: // %bb.0: +; SVE2_NOMIN_NOMAX-NEXT: ptrue p0.h, vl8 +; SVE2_NOMIN_NOMAX-NEXT: cnth x8 +; SVE2_NOMIN_NOMAX-NEXT: adrp x9, .LCPI7_0 +; SVE2_NOMIN_NOMAX-NEXT: adrp x10, .LCPI7_1 +; SVE2_NOMIN_NOMAX-NEXT: mov z0.h, w8 +; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] +; SVE2_NOMIN_NOMAX-NEXT: ldr q2, [x10, :lo12:.LCPI7_1] +; SVE2_NOMIN_NOMAX-NEXT: mad z0.h, p0/m, z1.h, z2.h +; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x0] +; SVE2_NOMIN_NOMAX-NEXT: ldr q2, [x1] +; SVE2_NOMIN_NOMAX-NEXT: tbl z0.h, { z1.h, z2.h }, z0.h +; SVE2_NOMIN_NOMAX-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_NOMIN_NOMAX-NEXT: ret +; +; SVE2_MIN_256_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: +; SVE2_MIN_256_NOMAX: // %bb.0: +; SVE2_MIN_256_NOMAX-NEXT: ptrue p0.h, vl16 +; SVE2_MIN_256_NOMAX-NEXT: adrp x8, .LCPI7_0 +; SVE2_MIN_256_NOMAX-NEXT: add x8, x8, :lo12:.LCPI7_0 +; SVE2_MIN_256_NOMAX-NEXT: adrp x9, .LCPI7_1 +; SVE2_MIN_256_NOMAX-NEXT: add x9, x9, :lo12:.LCPI7_1 +; SVE2_MIN_256_NOMAX-NEXT: cnth x10 +; SVE2_MIN_256_NOMAX-NEXT: mov z2.h, w10 +; SVE2_MIN_256_NOMAX-NEXT: ld1h { z0.h }, p0/z, [x8] +; SVE2_MIN_256_NOMAX-NEXT: ld1h { z1.h }, p0/z, [x9] +; SVE2_MIN_256_NOMAX-NEXT: mad z0.h, p0/m, z2.h, z1.h +; SVE2_MIN_256_NOMAX-NEXT: ldr q1, [x0] 
+; SVE2_MIN_256_NOMAX-NEXT: ldr q2, [x1] +; SVE2_MIN_256_NOMAX-NEXT: tbl z0.h, { z1.h, z2.h }, z0.h +; SVE2_MIN_256_NOMAX-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2_MIN_256_NOMAX-NEXT: ret + %op1 = load <8 x i16>, ptr %a + %op2 = load <8 x i16>, ptr %b + %1 = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> + ret <8 x i16> %1 +} From 73185854a3fc469b7d3e21d0b5d2ecb5ee15d201 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 21 Feb 2024 19:02:20 +0400 Subject: [PATCH 106/351] [clang] Implement CWG1719 "Layout compatibility and cv-qualification revisited" (#82358) This patch updates our internal notion of `layout-compatible` to ignore cv-qualification, which in turn fixes `__is_layout_compatible` intrinsic. --- clang/docs/ReleaseNotes.rst | 9 ++++++--- clang/lib/Sema/SemaChecking.cpp | 13 +++++++------ clang/test/CXX/drs/dr17xx.cpp | 12 ++++++------ clang/test/SemaCXX/type-traits.cpp | 26 ++++++++++++++++++-------- clang/www/cxx_dr_status.html | 4 ++-- 5 files changed, 39 insertions(+), 25 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c17298bc7bce5..15905e0895509 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -98,9 +98,8 @@ C++20 Feature Support - Implemented the `__is_layout_compatible` intrinsic to support `P0466R5: Layout-compatibility and Pointer-interconvertibility Traits `_. - Note: `CWG1719: Layout compatibility and cv-qualification revisited `_ - and `CWG2759: [[no_unique_address] and common initial sequence `_ - are not yet implemented. + Note: `CWG2759: [[no_unique_address] and common initial sequence `_ + is not yet implemented. C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -120,6 +119,10 @@ Resolutions to C++ Defect Reports in the template parameters, but is deduced from a previous argument. (`#78449: `_). +- Type qualifications are now ignored when evaluating layout compatibility + of two types. 
+ (`CWG1719: Layout compatibility and cv-qualification revisited `_). + C Language Changes ------------------ diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index d951c0fc2732d..e8bfb215a5b4c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -19124,15 +19124,16 @@ static bool isLayoutCompatible(ASTContext &C, QualType T1, QualType T2) { if (T1.isNull() || T2.isNull()) return false; - // C++11 [basic.types] p11: - // If two types T1 and T2 are the same type, then T1 and T2 are - // layout-compatible types. - if (C.hasSameType(T1, T2)) - return true; - + // C++20 [basic.types] p11: + // Two types cv1 T1 and cv2 T2 are layout-compatible types + // if T1 and T2 are the same type, layout-compatible enumerations (9.7.1), + // or layout-compatible standard-layout class types (11.4). T1 = T1.getCanonicalType().getUnqualifiedType(); T2 = T2.getCanonicalType().getUnqualifiedType(); + if (C.hasSameType(T1, T2)) + return true; + const Type::TypeClass TC1 = T1->getTypeClass(); const Type::TypeClass TC2 = T2->getTypeClass(); diff --git a/clang/test/CXX/drs/dr17xx.cpp b/clang/test/CXX/drs/dr17xx.cpp index e5cee19337ebd..d3cb5e58f06b3 100644 --- a/clang/test/CXX/drs/dr17xx.cpp +++ b/clang/test/CXX/drs/dr17xx.cpp @@ -46,7 +46,7 @@ namespace dr1715 { // dr1715: 3.9 #endif } -namespace dr1719 { // dr1719: no +namespace dr1719 { // dr1719: 19 #if __cplusplus >= 201103L struct CStruct { int one; @@ -66,11 +66,11 @@ struct CStructWithQualifiers { static_assert(__is_layout_compatible(CStruct, const CStruct2), ""); static_assert(__is_layout_compatible(CStruct, volatile CStruct2), ""); static_assert(__is_layout_compatible(const CStruct, volatile CStruct2), ""); -// FIXME: all of the following pairs of types are layout-compatible -static_assert(!__is_layout_compatible(int, const int), ""); -static_assert(!__is_layout_compatible(int, volatile int), ""); -static_assert(!__is_layout_compatible(const int, volatile int), ""); 
-static_assert(!__is_layout_compatible(CStruct, CStructWithQualifiers), ""); +static_assert(__is_layout_compatible(int, const int), ""); +static_assert(__is_layout_compatible(int, volatile int), ""); +static_assert(__is_layout_compatible(const int, volatile int), ""); +static_assert(__is_layout_compatible(CStruct, CStructWithQualifiers), ""); +static_assert(__is_layout_compatible(int[], const volatile int[]), ""); #endif } // namespace dr1719 diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index 6ff04b6c8c722..2c35d5ee19a4c 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -1609,7 +1609,12 @@ struct CStructNoUniqueAddress2 { [[no_unique_address]] int two; }; -struct CStructAlignment { +struct alignas(64) CStructAlignment { + int one; + int two; +}; + +struct CStructAlignedMembers { int one; alignas(16) int two; }; @@ -1711,13 +1716,17 @@ void is_layout_compatible(int n) { static_assert(__is_layout_compatible(void, void), ""); static_assert(!__is_layout_compatible(void, int), ""); - static_assert(!__is_layout_compatible(void, const void), ""); // FIXME: this is CWG1719 - static_assert(!__is_layout_compatible(void, volatile void), ""); // FIXME: this is CWG1719 - static_assert(!__is_layout_compatible(const int, volatile int), ""); // FIXME: this is CWG1719 + static_assert(__is_layout_compatible(void, const void), ""); + static_assert(__is_layout_compatible(void, volatile void), ""); + static_assert(__is_layout_compatible(const int, volatile int), ""); static_assert(__is_layout_compatible(int, int), ""); - static_assert(!__is_layout_compatible(int, const int), ""); // FIXME: this is CWG1719 - static_assert(!__is_layout_compatible(int, volatile int), ""); // FIXME: this is CWG1719 - static_assert(!__is_layout_compatible(const int, volatile int), ""); // FIXME: this is CWG1719 + static_assert(__is_layout_compatible(int, const int), ""); + static_assert(__is_layout_compatible(int, volatile 
int), ""); + static_assert(__is_layout_compatible(const int, volatile int), ""); + static_assert(__is_layout_compatible(int *, int * __restrict), ""); + // Note: atomic qualification matters for layout compatibility. + static_assert(!__is_layout_compatible(int, _Atomic int), ""); + static_assert(__is_layout_compatible(_Atomic(int), _Atomic int), ""); static_assert(!__is_layout_compatible(int, unsigned int), ""); static_assert(!__is_layout_compatible(char, unsigned char), ""); static_assert(!__is_layout_compatible(char, signed char), ""); @@ -1758,10 +1767,11 @@ void is_layout_compatible(int n) static_assert(!__is_layout_compatible(CppStructNonStandardByVirtBase, CppStructNonStandardByVirtBase2), ""); static_assert(!__is_layout_compatible(CppStructNonStandardBySameBase, CppStructNonStandardBySameBase2), ""); static_assert(!__is_layout_compatible(CppStructNonStandardBy2ndVirtBase, CppStructNonStandardBy2ndVirtBase2), ""); - static_assert(!__is_layout_compatible(CStruct, CStructWithQualifiers), ""); // FIXME: this is CWG1719 + static_assert(__is_layout_compatible(CStruct, CStructWithQualifiers), ""); static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) == bool(__has_cpp_attribute(no_unique_address)), ""); // FIXME: this is CWG2759 static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) == bool(__has_cpp_attribute(no_unique_address)), ""); // FIXME: this is CWG2759 static_assert(__is_layout_compatible(CStruct, CStructAlignment), ""); + static_assert(__is_layout_compatible(CStruct, CStructAlignedMembers), ""); // FIXME: alignment of members impact common initial sequence static_assert(__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds), ""); static_assert(__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds2), ""); static_assert(!__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds3), ""); diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 
e9b18b1e283e6..38e2cb6314266 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -7812,7 +7812,7 @@

C++ defect report implementation status

1334 NAD Layout compatibility and cv-qualification - Unknown + Superseded by 1719 1335 @@ -10122,7 +10122,7 @@

C++ defect report implementation status

1719 CD4 Layout compatibility and cv-qualification revisited - Unknown + Clang 19 1720 From 9c0e45d7f0e2202e16dbd9a7b9f462e2bcb741ae Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Wed, 21 Feb 2024 16:26:16 +0100 Subject: [PATCH 107/351] [SystemZ] Use VT (not ArgVT) for SlotVT in LowerCall(). (#82475) When an integer argument is promoted and *not* split (like i72 -> i128 on a new machine with vector support), the SlotVT should be i128, which is stored in VT - not ArgVT. Fixes #81417 --- .../lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +- llvm/test/CodeGen/SystemZ/frame-29.ll | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SystemZ/frame-29.ll diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index d92586f7d05d0..3b85a6ac0371e 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1923,7 +1923,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned N = getNumRegistersForCallingConv(Ctx, CLI.CallConv, OrigArgVT); SlotVT = EVT::getIntegerVT(Ctx, PartVT.getSizeInBits() * N); } else { - SlotVT = Outs[I].ArgVT; + SlotVT = Outs[I].VT; } SDValue SpillSlot = DAG.CreateStackTemporary(SlotVT); int FI = cast(SpillSlot)->getIndex(); diff --git a/llvm/test/CodeGen/SystemZ/frame-29.ll b/llvm/test/CodeGen/SystemZ/frame-29.ll new file mode 100644 index 0000000000000..6cc0d9e985e16 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/frame-29.ll @@ -0,0 +1,18 @@ +; RUN: llc %s -o - -mtriple=s390x-linux-gnu -mcpu=z16 -print-after=finalize-isel 2>&1 | FileCheck %s +; +; Test that the correct space is allocated for the outgoing stack argument. 
+ +declare void @bar(i72 %Arg); + +define void @foo() { +; CHECK-LABEL: # Machine code for function foo: IsSSA, TracksLiveness +; CHECK-NEXT: Frame Objects: +; CHECK-NEXT: fi#0: size=1, align=2, at location [SP] +; CHECK-NEXT: fi#1: size=16, align=8, at location [SP] + +; CHECK-LABEL: foo: +; CHECK: aghi %r15, -184 + %1 = alloca i8, align 2 + tail call fastcc void @bar(i72 2097168) + ret void +} From e214f004cb9e17847262d8fe64926a9cad6d2e86 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 21 Feb 2024 16:34:00 +0100 Subject: [PATCH 108/351] [mlir][Transforms][NFC] Turn in-place op modification into `IRRewrite` (#81245) This commit simplifies the internal state of the dialect conversion. A separate field for the previous state of in-place op modifications is no longer needed. --- .../mlir/Transforms/DialectConversion.h | 4 +- .../Transforms/Utils/DialectConversion.cpp | 146 +++++++++--------- 2 files changed, 74 insertions(+), 76 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 15fa39bde104b..0d7722aa07ee3 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -744,8 +744,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// PatternRewriter hook for updating the given operation in-place. /// Note: These methods only track updates to the given operation itself, - /// and not nested regions. Updates to regions will still require - /// notification through other more specific hooks above. + /// and not nested regions. Updates to regions will still require notification + /// through other more specific hooks above. void startOpModification(Operation *op) override; /// PatternRewriter hook for updating the given operation in-place. 
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index c58b856faefb6..84e7232d326a8 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -154,14 +154,12 @@ namespace { struct RewriterState { RewriterState(unsigned numCreatedOps, unsigned numUnresolvedMaterializations, unsigned numReplacements, unsigned numArgReplacements, - unsigned numRewrites, unsigned numIgnoredOperations, - unsigned numRootUpdates) + unsigned numRewrites, unsigned numIgnoredOperations) : numCreatedOps(numCreatedOps), numUnresolvedMaterializations(numUnresolvedMaterializations), numReplacements(numReplacements), numArgReplacements(numArgReplacements), numRewrites(numRewrites), - numIgnoredOperations(numIgnoredOperations), - numRootUpdates(numRootUpdates) {} + numIgnoredOperations(numIgnoredOperations) {} /// The current number of created operations. unsigned numCreatedOps; @@ -180,44 +178,6 @@ struct RewriterState { /// The current number of ignored operations. unsigned numIgnoredOperations; - - /// The current number of operations that were updated in place. - unsigned numRootUpdates; -}; - -//===----------------------------------------------------------------------===// -// OperationTransactionState - -/// The state of an operation that was updated by a pattern in-place. This -/// contains all of the necessary information to reconstruct an operation that -/// was updated in place. -class OperationTransactionState { -public: - OperationTransactionState() = default; - OperationTransactionState(Operation *op) - : op(op), loc(op->getLoc()), attrs(op->getAttrDictionary()), - operands(op->operand_begin(), op->operand_end()), - successors(op->successor_begin(), op->successor_end()) {} - - /// Discard the transaction state and reset the state of the original - /// operation. 
- void resetOperation() const { - op->setLoc(loc); - op->setAttrs(attrs); - op->setOperands(operands); - for (const auto &it : llvm::enumerate(successors)) - op->setSuccessor(it.value(), it.index()); - } - - /// Return the original operation of this state. - Operation *getOperation() const { return op; } - -private: - Operation *op; - LocationAttr loc; - DictionaryAttr attrs; - SmallVector operands; - SmallVector successors; }; //===----------------------------------------------------------------------===// @@ -754,14 +714,19 @@ namespace { class IRRewrite { public: /// The kind of the rewrite. Rewrites can be undone if the conversion fails. + /// Enum values are ordered, so that they can be used in `classof`: first all + /// block rewrites, then all operation rewrites. enum class Kind { + // Block rewrites CreateBlock, EraseBlock, InlineBlock, MoveBlock, SplitBlock, BlockTypeConversion, - MoveOperation + // Operation rewrites + MoveOperation, + ModifyOperation }; virtual ~IRRewrite() = default; @@ -992,7 +957,7 @@ class OperationRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::MoveOperation && - rewrite->getKind() <= Kind::MoveOperation; + rewrite->getKind() <= Kind::ModifyOperation; } protected: @@ -1031,8 +996,48 @@ class MoveOperationRewrite : public OperationRewrite { // this operation was the only operation in the region. Operation *insertBeforeOp; }; + +/// In-place modification of an op. This rewrite is immediately reflected in +/// the IR. The previous state of the operation is stored in this object. 
+class ModifyOperationRewrite : public OperationRewrite { +public: + ModifyOperationRewrite(ConversionPatternRewriterImpl &rewriterImpl, + Operation *op) + : OperationRewrite(Kind::ModifyOperation, rewriterImpl, op), + loc(op->getLoc()), attrs(op->getAttrDictionary()), + operands(op->operand_begin(), op->operand_end()), + successors(op->successor_begin(), op->successor_end()) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::ModifyOperation; + } + + void rollback() override { + op->setLoc(loc); + op->setAttrs(attrs); + op->setOperands(operands); + for (const auto &it : llvm::enumerate(successors)) + op->setSuccessor(it.value(), it.index()); + } + +private: + LocationAttr loc; + DictionaryAttr attrs; + SmallVector operands; + SmallVector successors; +}; } // namespace +/// Return "true" if there is an operation rewrite that matches the specified +/// rewrite type and operation among the given rewrites. +template +static bool hasRewrite(R &&rewrites, Operation *op) { + return any_of(std::move(rewrites), [&](auto &rewrite) { + auto *rewriteTy = dyn_cast(rewrite.get()); + return rewriteTy && rewriteTy->getOperation() == op; + }); +} + //===----------------------------------------------------------------------===// // ConversionPatternRewriterImpl //===----------------------------------------------------------------------===// @@ -1184,9 +1189,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// operation was ignored. SetVector ignoredOps; - /// A transaction state for each of operations that were updated in-place. - SmallVector rootUpdates; - /// A vector of indices into `replacements` of operations that were replaced /// with values with different result types than the original operation, e.g. /// 1->N conversion of some kind. 
@@ -1238,10 +1240,6 @@ static void detachNestedAndErase(Operation *op) { } void ConversionPatternRewriterImpl::discardRewrites() { - // Reset any operations that were updated in place. - for (auto &state : rootUpdates) - state.resetOperation(); - undoRewrites(); // Remove any newly created ops. @@ -1316,15 +1314,10 @@ void ConversionPatternRewriterImpl::applyRewrites() { RewriterState ConversionPatternRewriterImpl::getCurrentState() { return RewriterState(createdOps.size(), unresolvedMaterializations.size(), replacements.size(), argReplacements.size(), - rewrites.size(), ignoredOps.size(), rootUpdates.size()); + rewrites.size(), ignoredOps.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { - // Reset any operations that were updated in place. - for (unsigned i = state.numRootUpdates, e = rootUpdates.size(); i != e; ++i) - rootUpdates[i].resetOperation(); - rootUpdates.resize(state.numRootUpdates); - // Reset any replaced arguments. for (BlockArgument replacedArg : llvm::drop_begin(argReplacements, state.numArgReplacements)) @@ -1750,7 +1743,7 @@ void ConversionPatternRewriter::startOpModification(Operation *op) { #ifndef NDEBUG impl->pendingRootUpdates.insert(op); #endif - impl->rootUpdates.emplace_back(op); + impl->appendRewrite(op); } void ConversionPatternRewriter::finalizeOpModification(Operation *op) { @@ -1769,13 +1762,15 @@ void ConversionPatternRewriter::cancelOpModification(Operation *op) { "operation did not have a pending in-place update"); #endif // Erase the last update for this operation. 
- auto stateHasOp = [op](const auto &it) { return it.getOperation() == op; }; - auto &rootUpdates = impl->rootUpdates; - auto it = llvm::find_if(llvm::reverse(rootUpdates), stateHasOp); - assert(it != rootUpdates.rend() && "no root update started on op"); - (*it).resetOperation(); - int updateIdx = std::prev(rootUpdates.rend()) - it; - rootUpdates.erase(rootUpdates.begin() + updateIdx); + auto it = llvm::find_if( + llvm::reverse(impl->rewrites), [&](std::unique_ptr &rewrite) { + auto *modifyRewrite = dyn_cast(rewrite.get()); + return modifyRewrite && modifyRewrite->getOperation() == op; + }); + assert(it != impl->rewrites.rend() && "no root update started on op"); + (*it)->rollback(); + int updateIdx = std::prev(impl->rewrites.rend()) - it; + impl->rewrites.erase(impl->rewrites.begin() + updateIdx); } detail::ConversionPatternRewriterImpl &ConversionPatternRewriter::getImpl() { @@ -2059,6 +2054,7 @@ OperationLegalizer::legalizeWithPattern(Operation *op, // Functor that cleans up the rewriter state after a pattern failed to match. RewriterState curState = rewriterImpl.getCurrentState(); auto onFailure = [&](const Pattern &pattern) { + assert(rewriterImpl.pendingRootUpdates.empty() && "dangling root updates"); LLVM_DEBUG({ logFailure(rewriterImpl.logger, "pattern failed to match"); if (rewriterImpl.notifyCallback) { @@ -2076,6 +2072,7 @@ OperationLegalizer::legalizeWithPattern(Operation *op, // Functor that performs additional legalization when a pattern is // successfully applied. 
auto onSuccess = [&](const Pattern &pattern) { + assert(rewriterImpl.pendingRootUpdates.empty() && "dangling root updates"); auto result = legalizePatternResult(op, pattern, rewriter, curState); appliedPatterns.erase(&pattern); if (failed(result)) @@ -2118,7 +2115,6 @@ OperationLegalizer::legalizePatternResult(Operation *op, const Pattern &pattern, #ifndef NDEBUG assert(impl.pendingRootUpdates.empty() && "dangling root updates"); -#endif // Check that the root was either replaced or updated in place. auto replacedRoot = [&] { @@ -2127,14 +2123,12 @@ OperationLegalizer::legalizePatternResult(Operation *op, const Pattern &pattern, [op](auto &it) { return it.first == op; }); }; auto updatedRootInPlace = [&] { - return llvm::any_of( - llvm::drop_begin(impl.rootUpdates, curState.numRootUpdates), - [op](auto &state) { return state.getOperation() == op; }); + return hasRewrite( + llvm::drop_begin(impl.rewrites, curState.numRewrites), op); }; - (void)replacedRoot; - (void)updatedRootInPlace; assert((replacedRoot() || updatedRootInPlace()) && "expected pattern to replace the root operation"); +#endif // NDEBUG // Legalize each of the actions registered during application. 
RewriterState newState = impl.getCurrentState(); @@ -2221,8 +2215,11 @@ LogicalResult OperationLegalizer::legalizePatternCreatedOperations( LogicalResult OperationLegalizer::legalizePatternRootUpdates( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &impl, RewriterState &state, RewriterState &newState) { - for (int i = state.numRootUpdates, e = newState.numRootUpdates; i != e; ++i) { - Operation *op = impl.rootUpdates[i].getOperation(); + for (int i = state.numRewrites, e = newState.numRewrites; i != e; ++i) { + auto *rewrite = dyn_cast(impl.rewrites[i].get()); + if (!rewrite) + continue; + Operation *op = rewrite->getOperation(); if (failed(legalize(op, rewriter))) { LLVM_DEBUG(logFailure( impl.logger, "failed to legalize operation updated in-place '{0}'", @@ -3562,7 +3559,8 @@ mlir::applyPartialConversion(Operation *op, const ConversionTarget &target, // Full Conversion LogicalResult -mlir::applyFullConversion(ArrayRef ops, const ConversionTarget &target, +mlir::applyFullConversion(ArrayRef ops, + const ConversionTarget &target, const FrozenRewritePatternSet &patterns) { OperationConverter opConverter(target, patterns, OpConversionMode::Full); return opConverter.convertOperations(ops); From 3a70335bae25b9df39e20d714d3ed1ab0fc6d20a Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 21 Feb 2024 16:41:45 +0100 Subject: [PATCH 109/351] [mlir][Transforms] Support rolling back properties in dialect conversion (#82474) The dialect conversion rolls back in-place op modifications upon failure. Rolling back modifications of attributes is already supported, but there was no support for properties until now. 
--- .../Transforms/Utils/DialectConversion.cpp | 31 ++++++++++++++++++- mlir/test/Transforms/test-legalizer.mlir | 12 +++++++ mlir/test/lib/Dialect/Test/TestPatterns.cpp | 18 ++++++++++- 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 84e7232d326a8..cc61bc6b6260c 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1006,18 +1006,46 @@ class ModifyOperationRewrite : public OperationRewrite { : OperationRewrite(Kind::ModifyOperation, rewriterImpl, op), loc(op->getLoc()), attrs(op->getAttrDictionary()), operands(op->operand_begin(), op->operand_end()), - successors(op->successor_begin(), op->successor_end()) {} + successors(op->successor_begin(), op->successor_end()) { + if (OpaqueProperties prop = op->getPropertiesStorage()) { + // Make a copy of the properties. + propertiesStorage = operator new(op->getPropertiesStorageSize()); + OpaqueProperties propCopy(propertiesStorage); + op->getName().initOpProperties(propCopy, /*init=*/prop); + } + } static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() == Kind::ModifyOperation; } + ~ModifyOperationRewrite() override { + assert(!propertiesStorage && + "rewrite was neither committed nor rolled back"); + } + + void commit() override { + if (propertiesStorage) { + OpaqueProperties propCopy(propertiesStorage); + op->getName().destroyOpProperties(propCopy); + operator delete(propertiesStorage); + propertiesStorage = nullptr; + } + } + void rollback() override { op->setLoc(loc); op->setAttrs(attrs); op->setOperands(operands); for (const auto &it : llvm::enumerate(successors)) op->setSuccessor(it.value(), it.index()); + if (propertiesStorage) { + OpaqueProperties propCopy(propertiesStorage); + op->copyProperties(propCopy); + op->getName().destroyOpProperties(propCopy); + operator delete(propertiesStorage); + propertiesStorage = 
nullptr; + } } private: @@ -1025,6 +1053,7 @@ class ModifyOperationRewrite : public OperationRewrite { DictionaryAttr attrs; SmallVector operands; SmallVector successors; + void *propertiesStorage = nullptr; }; } // namespace diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 84fcc18ab7d37..62d776cd7573e 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -334,3 +334,15 @@ func.func @test_move_op_before_rollback() { }) : () -> () "test.return"() : () -> () } + +// ----- + +// CHECK-LABEL: func @test_properties_rollback() +func.func @test_properties_rollback() { + // CHECK: test.with_properties <{a = 32 : i64, + // expected-remark @below{{op 'test.with_properties' is not legalizable}} + test.with_properties + <{a = 32 : i64, array = array, b = "foo"}> + {modify_inplace} + "test.return"() : () -> () +} diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 2102a4ffabf7b..108cfe8950ef6 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -807,6 +807,21 @@ struct TestUndoBlockErase : public ConversionPattern { } }; +/// A pattern that modifies a property in-place, but keeps the op illegal. 
+struct TestUndoPropertiesModification : public ConversionPattern { + TestUndoPropertiesModification(MLIRContext *ctx) + : ConversionPattern("test.with_properties", /*benefit=*/1, ctx) {} + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + if (!op->hasAttr("modify_inplace")) + return failure(); + rewriter.modifyOpInPlace( + op, [&]() { cast(op).getProperties().setA(42); }); + return success(); + } +}; + //===----------------------------------------------------------------------===// // Type-Conversion Rewrite Testing @@ -1086,7 +1101,8 @@ struct TestLegalizePatternDriver TestChangeProducerTypeF32ToInvalid, TestUpdateConsumerType, TestNonRootReplacement, TestBoundedRecursiveRewrite, TestNestedOpCreationUndoRewrite, TestReplaceEraseOp, - TestCreateUnregisteredOp, TestUndoMoveOpBefore>(&getContext()); + TestCreateUnregisteredOp, TestUndoMoveOpBefore, + TestUndoPropertiesModification>(&getContext()); patterns.add(&getContext(), converter); mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns, converter); From b49f155cb9144b208b1291b5f02630d588350e1a Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 21 Feb 2024 16:49:58 +0100 Subject: [PATCH 110/351] [mlir][Transforms][NFC] Simplify `ArgConverter` state (#81462) * When converting a block signature, `ArgConverter` creates a new block with the new signature and moves all operation from the old block to the new block. The new block is temporarily inserted into a region that is stored in `regionMapping`. The old block is not yet deleted, so that the conversion can be rolled back. `regionMapping` is not needed. Instead of moving the old block to a temporary region, it can just be unlinked. Block erasures are handles in the same way in the dialect conversion. * `regionToConverter` is a mapping from regions to type converter. That field is never accessed within `ArgConverter`. 
It should be stored in `ConversionPatternRewriterImpl` instead. * `convertedBlocks` is not needed. Old blocks are already stored in `ConvertedBlockInfo`. --- .../Transforms/Utils/DialectConversion.cpp | 79 ++++++------------- 1 file changed, 22 insertions(+), 57 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index cc61bc6b6260c..88709bb261874 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -343,23 +343,6 @@ struct ArgConverter { const TypeConverter *converter; }; - /// Return if the signature of the given block has already been converted. - bool hasBeenConverted(Block *block) const { - return conversionInfo.count(block) || convertedBlocks.count(block); - } - - /// Set the type converter to use for the given region. - void setConverter(Region *region, const TypeConverter *typeConverter) { - assert(typeConverter && "expected valid type converter"); - regionToConverter[region] = typeConverter; - } - - /// Return the type converter to use for the given region, or null if there - /// isn't one. - const TypeConverter *getConverter(Region *region) { - return regionToConverter.lookup(region); - } - //===--------------------------------------------------------------------===// // Rewrite Application //===--------------------------------------------------------------------===// @@ -409,24 +392,10 @@ struct ArgConverter { ConversionValueMapping &mapping, SmallVectorImpl &argReplacements); - /// Insert a new conversion into the cache. - void insertConversion(Block *newBlock, ConvertedBlockInfo &&info); - /// A collection of blocks that have had their arguments converted. This is a /// map from the new replacement block, back to the original block. llvm::MapVector conversionInfo; - /// The set of original blocks that were converted. 
- DenseSet convertedBlocks; - - /// A mapping from valid regions, to those containing the original blocks of a - /// conversion. - DenseMap> regionMapping; - - /// A mapping of regions to type converters that should be used when - /// converting the arguments of blocks within that region. - DenseMap regionToConverter; - /// The pattern rewriter to use when materializing conversions. PatternRewriter &rewriter; @@ -474,12 +443,12 @@ void ArgConverter::discardRewrites(Block *block) { block->getArgument(i).dropAllUses(); block->replaceAllUsesWith(origBlock); - // Move the operations back the original block and the delete the new block. + // Move the operations back the original block, move the original block back + // into its original location and the delete the new block. origBlock->getOperations().splice(origBlock->end(), block->getOperations()); - origBlock->moveBefore(block); + block->getParent()->getBlocks().insert(Region::iterator(block), origBlock); block->erase(); - convertedBlocks.erase(origBlock); conversionInfo.erase(it); } @@ -510,6 +479,9 @@ void ArgConverter::applyRewrites(ConversionValueMapping &mapping) { mapping.lookupOrDefault(castValue, origArg.getType())); } } + + delete origBlock; + blockInfo.origBlock = nullptr; } } @@ -572,9 +544,11 @@ FailureOr ArgConverter::convertSignature( Block *block, const TypeConverter *converter, ConversionValueMapping &mapping, SmallVectorImpl &argReplacements) { - // Check if the block was already converted. If the block is detached, - // conservatively assume it is going to be deleted. - if (hasBeenConverted(block) || !block->getParent()) + // Check if the block was already converted. + // * If the block is mapped in `conversionInfo`, it is a converted block. + // * If the block is detached, conservatively assume that it is going to be + // deleted; it is likely the old block (before it was converted). 
+ if (conversionInfo.count(block) || !block->getParent()) return block; // If a converter wasn't provided, and the block wasn't already converted, // there is nothing we can do. @@ -603,6 +577,9 @@ Block *ArgConverter::applySignatureConversion( // signature. Block *newBlock = block->splitBlock(block->begin()); block->replaceAllUsesWith(newBlock); + // Unlink the block, but do not erase it yet, so that the change can be rolled + // back. + block->getParent()->getBlocks().remove(block); // Map all new arguments to the location of the argument they originate from. SmallVector newLocs(convertedTypes.size(), @@ -679,24 +656,8 @@ Block *ArgConverter::applySignatureConversion( ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg); } - // Remove the original block from the region and return the new one. - insertConversion(newBlock, std::move(info)); - return newBlock; -} - -void ArgConverter::insertConversion(Block *newBlock, - ConvertedBlockInfo &&info) { - // Get a region to insert the old block. - Region *region = newBlock->getParent(); - std::unique_ptr &mappedRegion = regionMapping[region]; - if (!mappedRegion) - mappedRegion = std::make_unique(region->getParentOp()); - - // Move the original block to the mapped region and emplace the conversion. - mappedRegion->getBlocks().splice(mappedRegion->end(), region->getBlocks(), - info.origBlock->getIterator()); - convertedBlocks.insert(info.origBlock); conversionInfo.insert({newBlock, std::move(info)}); + return newBlock; } //===----------------------------------------------------------------------===// @@ -1227,6 +1188,10 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// active. const TypeConverter *currentTypeConverter = nullptr; + /// A mapping of regions to type converters that should be used when + /// converting the arguments of blocks within that region. + DenseMap regionToConverter; + /// This allows the user to collect the match failure message. 
function_ref notifyCallback; @@ -1504,7 +1469,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( FailureOr ConversionPatternRewriterImpl::convertRegionTypes( Region *region, const TypeConverter &converter, TypeConverter::SignatureConversion *entryConversion) { - argConverter.setConverter(region, &converter); + regionToConverter[region] = &converter; if (region->empty()) return nullptr; @@ -1519,7 +1484,7 @@ FailureOr ConversionPatternRewriterImpl::convertRegionTypes( LogicalResult ConversionPatternRewriterImpl::convertNonEntryRegionTypes( Region *region, const TypeConverter &converter, ArrayRef blockConversions) { - argConverter.setConverter(region, &converter); + regionToConverter[region] = &converter; if (region->empty()) return success(); @@ -2195,7 +2160,7 @@ LogicalResult OperationLegalizer::legalizePatternBlockRewrites( // If the region of the block has a type converter, try to convert the block // directly. - if (auto *converter = impl.argConverter.getConverter(block->getParent())) { + if (auto *converter = impl.regionToConverter.lookup(block->getParent())) { if (failed(impl.convertBlockSignature(block, converter))) { LLVM_DEBUG(logFailure(impl.logger, "failed to convert types of moved " "block")); From f037e709cad410b885cb22ebb22e7e7539d41fb0 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 21 Feb 2024 07:56:08 -0800 Subject: [PATCH 111/351] [RISCV][TTI] Cost a subvector extract at a register boundary with exact vlen (#82405) If we have exact vlen knowledge, we can figure out which indices correspond to register boundaries. Our lowering uses this knowledge to replace the vslidedown.vi with a sub-register extract. Our costs can reflect that as well. 
This is another piece split off https://github.com/llvm/llvm-project/pull/80164 --------- Co-authored-by: Luke Lau --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 16 ++ .../RISCV/shuffle-extract_subvector.ll | 229 ++++++++++++++++++ 2 files changed, 245 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index cf234f25bf9d8..f04968d82e86e 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -436,6 +436,22 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (Index == 0) return TTI::TCC_Free; + // If we're extracting a subvector of at most m1 size at a sub-register + // boundary - which unfortunately we need exact vlen to identify - this is + // a subregister extract at worst and thus won't require a vslidedown. + // TODO: Extend for aligned m2, m4 subvector extracts + // TODO: Extend for misalgined (but contained) extracts + // TODO: Extend for scalable subvector types + if (std::pair SubLT = getTypeLegalizationCost(SubTp); + SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) { + const unsigned MinVLen = ST->getRealMinVLen(); + const unsigned MaxVLen = ST->getRealMaxVLen(); + if (MinVLen == MaxVLen && + SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 && + SubLT.second.getSizeInBits() <= MinVLen) + return TTI::TCC_Free; + } + // Example sequence: // vsetivli zero, 4, e8, mf2, tu, ma (ignored) // vslidedown.vi v8, v9, 2 diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll index 793786318a0a1..3ac2b7e26650a 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -passes="print" 2>&1 
-disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=-1 | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 -riscv-v-vector-bits-max=128 | FileCheck --check-prefix=VLEN128 %s ; Check that we don't crash querying costs when vectors are not enabled. ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv32 @@ -20,6 +21,19 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VLEN128-LABEL: 'test_vXf64' +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, 
<4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of -1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> @@ -46,6 +60,18 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VLEN128-LABEL: 'test_vXi64' +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> @@ -84,6 +110,31 @@ define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VLEN128-LABEL: 'test_vXi32' +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> undef, <2 x 
i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_89 = shufflevector <16 x i32> %src512, <16 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> undef, <2 x i32> %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> undef, <2 x i32> @@ -169,6 +220,65 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VLEN128-LABEL: 'test_vXi16' +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_2345 = shufflevector <16 x i16> 
%src256, <16 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x 
i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; 
VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x 
i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> @@ -348,6 +458,125 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; VLEN128-LABEL: 'test_vXi8' +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> +; 
VLEN128-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_0A_0B = shufflevector <64 x 
i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
%V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> +; VLEN128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> From 3ee8c93769cd094ea0748b4a446a475160c0f51f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 21 Feb 2024 09:55:55 -0600 Subject: [PATCH 112/351] [Offload] Fix NVPTX global entry names Summary: This was missed, the NVPTX globals cannot use a `.`. 
--- llvm/lib/Frontend/Offloading/Utility.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Frontend/Offloading/Utility.cpp b/llvm/lib/Frontend/Offloading/Utility.cpp index a3d24bfcc5151..a0d9dfa9e2b55 100644 --- a/llvm/lib/Frontend/Offloading/Utility.cpp +++ b/llvm/lib/Frontend/Offloading/Utility.cpp @@ -70,7 +70,7 @@ void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, getOffloadingEntryInitializer(M, Addr, Name, Size, Flags, Data); StringRef Prefix = - Triple.isNVPTX() ? "$omp_offloading$entry." : ".omp_offloading.entry."; + Triple.isNVPTX() ? "$omp_offloading$entry$" : ".omp_offloading.entry."; auto *Entry = new GlobalVariable( M, getEntryTy(M), /*isConstant=*/true, GlobalValue::WeakAnyLinkage, EntryInitializer, From ffcdf47bc443b36754c36bd6e1a77b4163657a00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 21 Feb 2024 16:38:01 +0100 Subject: [PATCH 113/351] [clang][Interp] Allow adding an offset to a function pointer Pretty sure this isn't doing anything, but it fixes a test and is generally the right thing to do. Fixing the behavior will come later. 
--- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 11 ++++---- clang/test/AST/Interp/pointer-addition.c | 32 ++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 clang/test/AST/Interp/pointer-addition.c diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index d11d05dd709d5..0b08309e4e6e0 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1403,12 +1403,11 @@ bool ByteCodeExprGen::VisitPointerCompoundAssignOperator( if (!LT || !RT) return false; - assert(*LT == PT_Ptr); if (!visit(LHS)) return false; - if (!this->emitLoadPtr(LHS)) + if (!this->emitLoad(*LT, LHS)) return false; if (!visit(RHS)) @@ -2828,7 +2827,7 @@ bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (T == PT_Ptr) { + if (T == PT_Ptr || T == PT_FnPtr) { if (!this->emitIncPtr(E)) return false; @@ -2846,7 +2845,7 @@ bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (T == PT_Ptr) { + if (T == PT_Ptr || T == PT_FnPtr) { if (!this->emitDecPtr(E)) return false; @@ -2864,7 +2863,7 @@ bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (T == PT_Ptr) { + if (T == PT_Ptr || T == PT_FnPtr) { if (!this->emitLoadPtr(E)) return false; if (!this->emitConstUint8(1, E)) @@ -2903,7 +2902,7 @@ bool ByteCodeExprGen::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (T == PT_Ptr) { + if (T == PT_Ptr || T == PT_FnPtr) { if (!this->emitLoadPtr(E)) return false; if (!this->emitConstUint8(1, E)) diff --git a/clang/test/AST/Interp/pointer-addition.c b/clang/test/AST/Interp/pointer-addition.c new file mode 100644 index 0000000000000..80ab670e8bfac --- /dev/null +++ b/clang/test/AST/Interp/pointer-addition.c @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 %s -fsyntax-only 
-verify=gnu,expected -pedantic -Wextra -std=c11 -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 %s -fsyntax-only -triple i686-unknown-unknown -verify=gnu,expected -pedantic -Wextra -std=c11 -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 %s -fsyntax-only -triple x86_64-unknown-unknown -verify=gnu,expected -pedantic -Wextra -std=c11 -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 %s -fsyntax-only -verify -pedantic -Wextra -Wno-gnu -std=c11 -fexperimental-new-constant-interpreter + +typedef __INTPTR_TYPE__ intptr_t; +typedef struct S S; // expected-note 4 {{forward declaration of 'struct S'}} +extern _Atomic(S*) e; +void a(S* b, void* c) { + void (*fp)(int) = 0; + b++; // expected-error {{arithmetic on a pointer to an incomplete type}} + b += 1; // expected-error {{arithmetic on a pointer to an incomplete type}} + c++; // gnu-warning {{arithmetic on a pointer to void is a GNU extension}} + c += 1; // gnu-warning {{arithmetic on a pointer to void is a GNU extension}} + c--; // gnu-warning {{arithmetic on a pointer to void is a GNU extension}} + c -= 1; // gnu-warning {{arithmetic on a pointer to void is a GNU extension}} + (void) c[1]; // gnu-warning {{subscript of a pointer to void is a GNU extension}} + b = 1+b; // expected-error {{arithmetic on a pointer to an incomplete type}} + /* The next couple tests are only pedantic warnings in gcc */ + void (*d)(S*,void*) = a; + d += 1; // gnu-warning {{arithmetic on a pointer to the function type 'void (S *, void *)' (aka 'void (struct S *, void *)') is a GNU extension}} + d++; // gnu-warning {{arithmetic on a pointer to the function type 'void (S *, void *)' (aka 'void (struct S *, void *)') is a GNU extension}} + d--; // gnu-warning {{arithmetic on a pointer to the function type 'void (S *, void *)' (aka 'void (struct S *, void *)') is a GNU extension}} + d -= 1; // gnu-warning {{arithmetic on a pointer to the function type 'void (S *, void *)' (aka 'void (struct S *, void *)') is a GNU 
extension}} + (void)(1 + d); // gnu-warning {{arithmetic on a pointer to the function type 'void (S *, void *)' (aka 'void (struct S *, void *)') is a GNU extension}} + e++; // expected-error {{arithmetic on a pointer to an incomplete type}} + intptr_t i = (intptr_t)b; + char *f = (char*)0 + i; // gnu-warning {{arithmetic on a null pointer treated as a cast from integer to pointer is a GNU extension}} + // Cases that don't match the GNU inttoptr idiom get a different warning. + f = (char*)0 - i; // expected-warning {{performing pointer arithmetic on a null pointer has undefined behavior}} + int *g = (int*)0 + i; // expected-warning {{performing pointer arithmetic on a null pointer has undefined behavior}} +} From 4beb4d5c72880ec69ef36bdebaed06c90cc5309b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 15:35:35 +0000 Subject: [PATCH 114/351] [CostModel][X86] Add test coverage for icmp vs zero This is really to test for icmp vs constant - some icmp unsigned could fold to simpler comparisons, but costmodel analysis won't do this --- llvm/test/Analysis/CostModel/X86/icmp0.ll | 3057 +++++++++++++++++++++ 1 file changed, 3057 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/X86/icmp0.ll diff --git a/llvm/test/Analysis/CostModel/X86/icmp0.ll b/llvm/test/Analysis/CostModel/X86/icmp0.ll new file mode 100644 index 0000000000000..0d8a25207e781 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/icmp0.ll @@ -0,0 +1,3057 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output 
-mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+xop,+avx | FileCheck %s -check-prefixes=XOPAVX1 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mattr=+xop,+avx2 | FileCheck %s -check-prefixes=XOPAVX2 +; +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 + +define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_eq' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_eq' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp 
eq <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_eq' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer 
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_eq' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 
0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_eq' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq 
i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_eq' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_eq' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 
x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_eq' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, 
zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_eq' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> 
%argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_eq' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp eq <16 x 
i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_eq' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp eq <32 x 
i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp 
eq <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_eq' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp eq i8 %arg8, zeroinitializer + %V16I8 = icmp eq <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp eq <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp eq <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp eq <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp eq i16 %arg16, zeroinitializer + %V8I16 = icmp eq <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp eq <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp eq <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp eq <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp eq i32 %arg32, zeroinitializer + %V4I32 = icmp eq <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp eq <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp eq <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp eq <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp eq i64 %arg64, zeroinitializer + %V2I64 = icmp eq <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp eq <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp eq <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp eq <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> 
%argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_ne' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_ne' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = 
icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_ne' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer 
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_ne' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_ne' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = 
icmp ne i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_ne' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_ne' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> 
%argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_ne' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, 
zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_ne' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> 
%argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_ne' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne 
<32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 
= icmp ne <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_ne' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = 
icmp ne <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_ne' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, 
zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp ne i8 %arg8, zeroinitializer + %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp ne i16 %arg16, zeroinitializer + %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer + + 
%I32 = icmp ne i32 %arg32, zeroinitializer + %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp ne i64 %arg64, zeroinitializer + %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_sge' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_sge' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 
48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_sge' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_sge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_sge' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge 
<128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; 
AVX1-LABEL: 'cmp_int_sge' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_sge' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_sge' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_sge' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_sge' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer 
+; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_sge' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, 
zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_sge' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> 
%argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; 
SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp sge i8 %arg8, zeroinitializer + %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp sge i16 %arg16, zeroinitializer + %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp sge i32 %arg32, zeroinitializer + %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp sge i64 %arg64, zeroinitializer + %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_uge' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I8 = icmp uge i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> 
%argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_uge' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; 
SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_uge' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_uge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an 
estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_uge' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 
8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_uge' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 
= icmp uge <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_uge' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, 
zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_uge' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, 
zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_uge' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp uge 
<64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_uge' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_uge' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_uge' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp uge i8 %arg8, zeroinitializer + %V16I8 = icmp uge <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp uge <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp uge <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp uge <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp uge i16 %arg16, zeroinitializer + %V8I16 = icmp uge <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp uge <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp uge i32 %arg32, zeroinitializer + %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp uge <8 x i32> %argv8i32, 
zeroinitializer + %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp uge i64 %arg64, zeroinitializer + %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_sgt' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_sgt' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 
0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_sgt' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I64 = icmp sgt i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_sgt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 
%arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_sgt' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_sgt' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; AVX1-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_sgt' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_sgt' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp 
sgt <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_sgt' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_sgt' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; 
XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_sgt' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, 
zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_sgt' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, 
zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp sgt i8 %arg8, zeroinitializer + %V16I8 = icmp sgt <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp sgt <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp sgt <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp sgt <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp sgt i16 %arg16, zeroinitializer + %V8I16 = icmp sgt <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp sgt <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp sgt <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp sgt <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp sgt i32 %arg32, zeroinitializer + %V4I32 = icmp sgt <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp sgt <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp sgt <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp sgt <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp sgt i64 %arg64, zeroinitializer + %V2I64 = icmp sgt <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp sgt <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp sgt <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp sgt <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_ugt' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, 
zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_ugt' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found 
an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_ugt' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 
12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_ugt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for 
instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_ugt' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_ugt' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x 
i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_ugt' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_ugt' +; AVX512F-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_ugt' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_ugt' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp 
ugt i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_ugt' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I8 = icmp ugt i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_ugt' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x 
i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp ugt i8 %arg8, zeroinitializer + %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp ugt i16 %arg16, zeroinitializer + %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp ugt i32 %arg32, zeroinitializer + %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp ugt i64 %arg64, 
zeroinitializer + %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_sle' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_sle' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = 
icmp sle i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_sle' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_sle' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_sle' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_sle' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 
for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: 
%V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_sle' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> 
%argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_sle' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> 
%argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_sle' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp 
sle <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_sle' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_sle' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_sle' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret i32 undef +; + %I8 = icmp sle i8 %arg8, zeroinitializer + %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp sle i16 %arg16, zeroinitializer + %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp sle i32 %arg32, zeroinitializer + %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp sle i64 %arg64, zeroinitializer + %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_ule' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost 
Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_ule' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost 
of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_ule' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_ule' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule 
<128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; 
SSE42-LABEL: 'cmp_int_ule' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_ule' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_ule' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_ule' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp 
ule <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_ule' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_ule' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_ule' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer 
+; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> 
%argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_ule' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: 
Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp ule i8 %arg8, zeroinitializer + %V16I8 = icmp ule <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp ule <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp ule <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp ule <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp ule i16 %arg16, zeroinitializer + %V8I16 = icmp ule <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp ule <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp ule i32 %arg32, zeroinitializer + %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp ule i64 %arg64, zeroinitializer + %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp ule <8 x i64> %argv8i64, 
zeroinitializer + %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_slt' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, 
zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'cmp_int_slt' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_slt' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 
20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_slt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_slt' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt 
<32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_slt' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, 
zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer 
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_slt' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_slt' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_slt' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 
'cmp_int_slt' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt 
i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_slt' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%I32 = icmp slt i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_slt' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 
%arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp slt i8 %arg8, zeroinitializer + %V16I8 = icmp slt <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp slt <32 x i8> %argv32i8, zeroinitializer + %V64I8 = 
icmp slt <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp slt <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp slt i16 %arg16, zeroinitializer + %V8I16 = icmp slt <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp slt <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp slt <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp slt <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp slt i32 %arg32, zeroinitializer + %V4I32 = icmp slt <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp slt <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp slt <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp slt <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp slt i64 %arg64, zeroinitializer + %V2I64 = icmp slt <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp slt <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp slt <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp slt <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + +define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_ult' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult 
<128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 
'cmp_int_ult' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; SSE3-NEXT: Cost Model: Found 
an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'cmp_int_ult' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost 
of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_ult' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'cmp_int_ult' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'cmp_int_ult' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> 
%argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'cmp_int_ult' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; 
AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'cmp_int_ult' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, 
zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'cmp_int_ult' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = 
icmp ult <16 x i32> %argv16i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX1-LABEL: 'cmp_int_ult' +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; XOPAVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; XOPAVX2-LABEL: 'cmp_int_ult' +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'cmp_int_ult' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I8 = icmp ult i8 %arg8, zeroinitializer + %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer + %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer + %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer + %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer + + %I16 = icmp ult i16 %arg16, zeroinitializer + %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer + %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer + %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer + %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer + + %I32 = icmp ult i32 %arg32, zeroinitializer + %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer + %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer + %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer + %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer + + %I64 = icmp ult i64 %arg64, zeroinitializer + %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer + %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer + %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer + %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer + + ret i32 undef +} + From c16d0d14de40559eb3845a88b6434550dd1dcf77 Mon Sep 17 00:00:00 2001 From: Simon 
Pilgrim Date: Wed, 21 Feb 2024 16:00:49 +0000 Subject: [PATCH 115/351] [SimplifyCFG] Add test coverage for #80122 --- .../Transforms/SimplifyCFG/X86/PR80122.ll | 195 ++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/PR80122.ll diff --git a/llvm/test/Transforms/SimplifyCFG/X86/PR80122.ll b/llvm/test/Transforms/SimplifyCFG/X86/PR80122.ll new file mode 100644 index 0000000000000..c5570b9e5271d --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/X86/PR80122.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -S -passes=simplifycfg -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -S -passes=simplifycfg -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE,SSE4 +; RUN: opt < %s -S -passes=simplifycfg -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -S -passes=simplifycfg -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define zeroext i1 @cmp128(<2 x i64> %x, <2 x i64> %y) { +; SSE-LABEL: define zeroext i1 @cmp128( +; SSE-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; SSE-NEXT: entry: +; SSE-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer +; SSE-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 +; SSE-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 +; SSE-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; SSE: land.rhs: +; SSE-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer +; SSE-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 +; SSE-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 +; SSE-NEXT: br label [[LAND_END]] +; SSE: land.end: +; SSE-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] +; SSE-NEXT: ret i1 [[TMP2]] +; +; AVX2-LABEL: define zeroext i1 @cmp128( +; AVX2-SAME: <2 x i64> 
[[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX2-NEXT: entry: +; AVX2-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer +; AVX2-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 +; AVX2-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 +; AVX2-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; AVX2: land.rhs: +; AVX2-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer +; AVX2-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 +; AVX2-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 +; AVX2-NEXT: br label [[LAND_END]] +; AVX2: land.end: +; AVX2-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] +; AVX2-NEXT: ret i1 [[TMP2]] +; +; AVX512-LABEL: define zeroext i1 @cmp128( +; AVX512-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX512-NEXT: entry: +; AVX512-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer +; AVX512-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 +; AVX512-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 +; AVX512-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer +; AVX512-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 +; AVX512-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 +; AVX512-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i1 [[DOTNOT9]], i1 false +; AVX512-NEXT: ret i1 [[TMP2]] +; +entry: + %cmp = icmp ne <2 x i64> %x, zeroinitializer + %0 = bitcast <2 x i1> %cmp to i2 + %.not = icmp eq i2 %0, 0 + br i1 %.not, label %land.rhs, label %land.end + +land.rhs: + %cmp2 = icmp ne <2 x i64> %y, zeroinitializer + %1 = bitcast <2 x i1> %cmp2 to i2 + %.not9 = icmp eq i2 %1, 0 + br label %land.end + +land.end: + %2 = phi i1 [ false, %entry ], [ %.not9, %land.rhs ] + ret i1 %2 +} + +define zeroext i1 @cmp256(<4 x i64> %x, <4 x i64> %y) { +; SSE-LABEL: define zeroext i1 @cmp256( +; SSE-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]]) #[[ATTR0]] { +; SSE-NEXT: entry: +; SSE-NEXT: [[CMP:%.*]] = icmp ne <4 x i64> [[X]], 
zeroinitializer +; SSE-NEXT: [[TMP0:%.*]] = bitcast <4 x i1> [[CMP]] to i4 +; SSE-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP0]], 0 +; SSE-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; SSE: land.rhs: +; SSE-NEXT: [[CMP2:%.*]] = icmp ne <4 x i64> [[Y]], zeroinitializer +; SSE-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[CMP2]] to i4 +; SSE-NEXT: [[DOTNOT9:%.*]] = icmp eq i4 [[TMP1]], 0 +; SSE-NEXT: br label [[LAND_END]] +; SSE: land.end: +; SSE-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] +; SSE-NEXT: ret i1 [[TMP2]] +; +; AVX2-LABEL: define zeroext i1 @cmp256( +; AVX2-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: entry: +; AVX2-NEXT: [[CMP:%.*]] = icmp ne <4 x i64> [[X]], zeroinitializer +; AVX2-NEXT: [[TMP0:%.*]] = bitcast <4 x i1> [[CMP]] to i4 +; AVX2-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP0]], 0 +; AVX2-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; AVX2: land.rhs: +; AVX2-NEXT: [[CMP2:%.*]] = icmp ne <4 x i64> [[Y]], zeroinitializer +; AVX2-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[CMP2]] to i4 +; AVX2-NEXT: [[DOTNOT9:%.*]] = icmp eq i4 [[TMP1]], 0 +; AVX2-NEXT: br label [[LAND_END]] +; AVX2: land.end: +; AVX2-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] +; AVX2-NEXT: ret i1 [[TMP2]] +; +; AVX512-LABEL: define zeroext i1 @cmp256( +; AVX512-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: entry: +; AVX512-NEXT: [[CMP:%.*]] = icmp ne <4 x i64> [[X]], zeroinitializer +; AVX512-NEXT: [[TMP0:%.*]] = bitcast <4 x i1> [[CMP]] to i4 +; AVX512-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP0]], 0 +; AVX512-NEXT: [[CMP2:%.*]] = icmp ne <4 x i64> [[Y]], zeroinitializer +; AVX512-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[CMP2]] to i4 +; AVX512-NEXT: [[DOTNOT9:%.*]] = icmp eq i4 [[TMP1]], 0 +; AVX512-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i1 [[DOTNOT9]], i1 false +; AVX512-NEXT: ret i1 [[TMP2]] +; 
+entry: + %cmp = icmp ne <4 x i64> %x, zeroinitializer + %0 = bitcast <4 x i1> %cmp to i4 + %.not = icmp eq i4 %0, 0 + br i1 %.not, label %land.rhs, label %land.end + +land.rhs: + %cmp2 = icmp ne <4 x i64> %y, zeroinitializer + %1 = bitcast <4 x i1> %cmp2 to i4 + %.not9 = icmp eq i4 %1, 0 + br label %land.end + +land.end: + %2 = phi i1 [ false, %entry ], [ %.not9, %land.rhs ] + ret i1 %2 +} + +define zeroext i1 @cmp512(<8 x i64> %x, <8 x i64> %y) { +; SSE-LABEL: define zeroext i1 @cmp512( +; SSE-SAME: <8 x i64> [[X:%.*]], <8 x i64> [[Y:%.*]]) #[[ATTR0]] { +; SSE-NEXT: entry: +; SSE-NEXT: [[CMP:%.*]] = icmp ne <8 x i64> [[X]], zeroinitializer +; SSE-NEXT: [[TMP0:%.*]] = bitcast <8 x i1> [[CMP]] to i8 +; SSE-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP0]], 0 +; SSE-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; SSE: land.rhs: +; SSE-NEXT: [[CMP2:%.*]] = icmp ne <8 x i64> [[Y]], zeroinitializer +; SSE-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[CMP2]] to i8 +; SSE-NEXT: [[DOTNOT9:%.*]] = icmp eq i8 [[TMP1]], 0 +; SSE-NEXT: br label [[LAND_END]] +; SSE: land.end: +; SSE-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] +; SSE-NEXT: ret i1 [[TMP2]] +; +; AVX2-LABEL: define zeroext i1 @cmp512( +; AVX2-SAME: <8 x i64> [[X:%.*]], <8 x i64> [[Y:%.*]]) #[[ATTR0]] { +; AVX2-NEXT: entry: +; AVX2-NEXT: [[CMP:%.*]] = icmp ne <8 x i64> [[X]], zeroinitializer +; AVX2-NEXT: [[TMP0:%.*]] = bitcast <8 x i1> [[CMP]] to i8 +; AVX2-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP0]], 0 +; AVX2-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; AVX2: land.rhs: +; AVX2-NEXT: [[CMP2:%.*]] = icmp ne <8 x i64> [[Y]], zeroinitializer +; AVX2-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[CMP2]] to i8 +; AVX2-NEXT: [[DOTNOT9:%.*]] = icmp eq i8 [[TMP1]], 0 +; AVX2-NEXT: br label [[LAND_END]] +; AVX2: land.end: +; AVX2-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] +; AVX2-NEXT: ret i1 [[TMP2]] +; +; 
AVX512-LABEL: define zeroext i1 @cmp512( +; AVX512-SAME: <8 x i64> [[X:%.*]], <8 x i64> [[Y:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: entry: +; AVX512-NEXT: [[CMP:%.*]] = icmp ne <8 x i64> [[X]], zeroinitializer +; AVX512-NEXT: [[TMP0:%.*]] = bitcast <8 x i1> [[CMP]] to i8 +; AVX512-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP0]], 0 +; AVX512-NEXT: [[CMP2:%.*]] = icmp ne <8 x i64> [[Y]], zeroinitializer +; AVX512-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[CMP2]] to i8 +; AVX512-NEXT: [[DOTNOT9:%.*]] = icmp eq i8 [[TMP1]], 0 +; AVX512-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i1 [[DOTNOT9]], i1 false +; AVX512-NEXT: ret i1 [[TMP2]] +; +entry: + %cmp = icmp ne <8 x i64> %x, zeroinitializer + %0 = bitcast <8 x i1> %cmp to i8 + %.not = icmp eq i8 %0, 0 + br i1 %.not, label %land.rhs, label %land.end + +land.rhs: + %cmp2 = icmp ne <8 x i64> %y, zeroinitializer + %1 = bitcast <8 x i1> %cmp2 to i8 + %.not9 = icmp eq i8 %1, 0 + br label %land.end + +land.end: + %2 = phi i1 [ false, %entry ], [ %.not9, %land.rhs ] + ret i1 %2 +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} +; SSE2: {{.*}} +; SSE4: {{.*}} From 9978f6a10f37d12e1eecad0d4bfacd350d933ed7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 21 Feb 2024 16:19:17 +0000 Subject: [PATCH 116/351] [CostModel][X86] Reduce the extra costs for ICMP complex predicates when an operand is constant In most cases, SETCC lowering will be able to simplify/commute the comparison by adjusting the constant. 
TODO: We still need to adjust ExtraCost based on CostKind Fixes #80122 --- .../lib/Target/X86/X86TargetTransformInfo.cpp | 13 +- llvm/test/Analysis/CostModel/X86/icmp0.ll | 1668 ++++++++--------- .../Transforms/SimplifyCFG/X86/PR80122.ll | 122 +- 3 files changed, 892 insertions(+), 911 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index f91e13f997f78..18bf32fe1acaa 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3090,6 +3090,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, InstructionCost ExtraCost = 0; if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { // Some vector comparison predicates cost extra instructions. + // TODO: Adjust ExtraCost based on CostKind? // TODO: Should we invert this and assume worst case cmp costs // and reduce for particular predicates? if (MTy.isVector() && @@ -3102,21 +3103,25 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Pred == CmpInst::BAD_FCMP_PREDICATE)) Pred = cast(I)->getPredicate(); + bool CmpWithConstant = false; + if (auto *CmpInstr = dyn_cast_or_null(I)) + CmpWithConstant = isa(CmpInstr->getOperand(1)); + switch (Pred) { case CmpInst::Predicate::ICMP_NE: // xor(cmpeq(x,y),-1) - ExtraCost = 1; + ExtraCost = CmpWithConstant ? 0 : 1; break; case CmpInst::Predicate::ICMP_SGE: case CmpInst::Predicate::ICMP_SLE: // xor(cmpgt(x,y),-1) - ExtraCost = 1; + ExtraCost = CmpWithConstant ? 0 : 1; break; case CmpInst::Predicate::ICMP_ULT: case CmpInst::Predicate::ICMP_UGT: // cmpgt(xor(x,signbit),xor(y,signbit)) // xor(cmpeq(pmaxu(x,y),x),-1) - ExtraCost = 2; + ExtraCost = CmpWithConstant ? 
1 : 2; break; case CmpInst::Predicate::ICMP_ULE: case CmpInst::Predicate::ICMP_UGE: @@ -3127,7 +3132,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, ExtraCost = 1; } else { // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) - ExtraCost = 3; + ExtraCost = CmpWithConstant ? 2 : 3; } break; case CmpInst::Predicate::FCMP_ONE: diff --git a/llvm/test/Analysis/CostModel/X86/icmp0.ll b/llvm/test/Analysis/CostModel/X86/icmp0.ll index 0d8a25207e781..db9affa40b990 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp0.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp0.ll @@ -322,176 +322,176 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ne' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, 
zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, 
zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_ne' ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; SSE3-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, 
zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer 
+; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_ne' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; SSSE3-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer -; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_ne' ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, 
zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> 
%argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ne' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, 
zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = 
icmp ne i32 %arg32, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = 
icmp ne <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ne' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ne <64 x i16> 
%argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ne <4 x i64> 
%argv4i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ne' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp ne <128 x i8> 
%argv128i8, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, 
zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ne' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x 
i32> %argv4i32, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer @@ -553,47 +553,47 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; XOPAVX2-LABEL: 'cmp_int_ne' ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ne' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = 
icmp ne <2 x i64> %argv2i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = icmp ne i8 %arg8, zeroinitializer @@ -626,176 +626,176 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sge' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 -; SSE2-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, 
zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_sge' ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, 
zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> 
%argv4i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_sge' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = 
icmp sge <128 x i8> %argv128i8, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_sge' ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 -; SSE41-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, 
zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> 
%argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = 
icmp sge <4 x i64> %argv4i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sge' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sge' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sge' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer @@ -857,47 +857,47 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; XOPAVX2-LABEL: 'cmp_int_sge' ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sge 
<128 x i8> %argv128i8, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_sge' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = icmp sge i8 %arg8, zeroinitializer @@ -940,15 +940,15 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost 
of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_uge' @@ -963,15 +963,15 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 
for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_uge' @@ -986,15 +986,15 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp 
uge <8 x i32> %argv8i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_uge' @@ -1014,10 +1014,10 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_uge' @@ -1037,10 +1037,10 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_uge' @@ -1060,10 +1060,10 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = 
icmp uge <4 x i64> %argv4i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_uge' @@ -1083,10 +1083,10 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp uge 
<2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_uge' @@ -1176,9 +1176,9 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_uge' @@ -1198,10 +1198,10 @@ define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x 
i8> %argv32i8, <64 ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = icmp uge i8 %arg8, zeroinitializer @@ -1538,176 +1538,176 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> 
%argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ugt' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_ugt' ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost 
of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; 
SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_ugt' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> 
%argv8i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost 
of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_ugt' ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; SSE41-NEXT: Cost 
Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ugt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; SSE42-NEXT: 
Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt 
<16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ugt' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 
x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp 
ugt <8 x i32> %argv8i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ugt' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ugt' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; AVX512F-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer @@ -1769,47 +1769,47 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; XOPAVX2-LABEL: 'cmp_int_ugt' ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer 
-; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ugt' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 
6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, zeroinitializer ; SLM-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated 
cost of 12 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = icmp ugt i8 %arg8, zeroinitializer @@ -1842,176 +1842,176 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_sle' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> 
%argv128i8, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 
x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_sle' ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> 
%argv64i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 
x i32> %argv4i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 40 for 
instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_sle' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_sle' ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; 
SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> 
%argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_sle' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> 
%argv64i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found 
an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_sle' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; AVX1-NEXT: Cost Model: 
Found an estimated cost of 10 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_sle' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; AVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_sle' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x 
i32> %argv4i32, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer @@ -2073,47 +2073,47 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; XOPAVX2-LABEL: 'cmp_int_sle' ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; 
XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_sle' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer -; SLM-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, 0 -; SLM-NEXT: Cost Model: Found 
an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = icmp sle i8 %arg8, zeroinitializer @@ -2156,15 +2156,15 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'cmp_int_ule' @@ -2179,15 +2179,15 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE3-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: 
Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_ule' @@ -2202,15 +2202,15 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost 
Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_ule' @@ -2230,10 +2230,10 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'cmp_int_ule' @@ -2253,10 +2253,10 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ule' @@ -2276,10 +2276,10 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found 
an estimated cost of 24 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ule' @@ -2299,10 +2299,10 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ule' @@ -2392,9 +2392,9 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ule' @@ -2414,10 +2414,10 @@ define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule 
<8 x i64> %argv8i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = icmp ule i8 %arg8, zeroinitializer @@ -2754,176 +2754,176 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { ; SSE2-LABEL: 'cmp_int_ult' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; SSE2-NEXT: Cost 
Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 
'cmp_int_ult' ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, 
zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 -; SSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; SSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> 
%argv16i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'cmp_int_ult' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> 
%argv8i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'cmp_int_ult' ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; SSE41-NEXT: 
Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 -; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; 
SSE42-LABEL: 'cmp_int_ult' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp 
ult <16 x i16> %argv16i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'cmp_int_ult' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; AVX1-NEXT: Cost Model: Found 
an estimated cost of 20 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'cmp_int_ult' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'cmp_int_ult' ; AVX512F-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ult <16 x 
i16> %argv16i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer @@ -2985,47 +2985,47 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; XOPAVX2-LABEL: 'cmp_int_ult' ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = 
icmp ult <8 x i16> %argv8i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 ; XOPAVX2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; XOPAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'cmp_int_ult' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, zeroinitializer +; 
SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, zeroinitializer +; 
SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, 0 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, zeroinitializer +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I8 = icmp ult i8 %arg8, zeroinitializer diff --git a/llvm/test/Transforms/SimplifyCFG/X86/PR80122.ll b/llvm/test/Transforms/SimplifyCFG/X86/PR80122.ll index c5570b9e5271d..0971153ee1604 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/PR80122.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/PR80122.ll @@ -5,49 +5,45 @@ ; RUN: opt < %s -S -passes=simplifycfg -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s 
--check-prefixes=AVX,AVX512 define zeroext i1 @cmp128(<2 x i64> %x, <2 x i64> %y) { -; SSE-LABEL: define zeroext i1 @cmp128( -; SSE-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -; SSE-NEXT: entry: -; SSE-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer -; SSE-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 -; SSE-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 -; SSE-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] -; SSE: land.rhs: -; SSE-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer -; SSE-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 -; SSE-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 -; SSE-NEXT: br label [[LAND_END]] -; SSE: land.end: -; SSE-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] -; SSE-NEXT: ret i1 [[TMP2]] +; SSE2-LABEL: define zeroext i1 @cmp128( +; SSE2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; SSE2-NEXT: entry: +; SSE2-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer +; SSE2-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 +; SSE2-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 +; SSE2-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; SSE2: land.rhs: +; SSE2-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer +; SSE2-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 +; SSE2-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 +; SSE2-NEXT: br label [[LAND_END]] +; SSE2: land.end: +; SSE2-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] +; SSE2-NEXT: ret i1 [[TMP2]] ; -; AVX2-LABEL: define zeroext i1 @cmp128( -; AVX2-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -; AVX2-NEXT: entry: -; AVX2-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer -; AVX2-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 -; AVX2-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 -; AVX2-NEXT: br i1 [[DOTNOT]], label 
[[LAND_RHS:%.*]], label [[LAND_END:%.*]] -; AVX2: land.rhs: -; AVX2-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer -; AVX2-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 -; AVX2-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 -; AVX2-NEXT: br label [[LAND_END]] -; AVX2: land.end: -; AVX2-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] -; AVX2-NEXT: ret i1 [[TMP2]] +; SSE4-LABEL: define zeroext i1 @cmp128( +; SSE4-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; SSE4-NEXT: entry: +; SSE4-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer +; SSE4-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 +; SSE4-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 +; SSE4-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer +; SSE4-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 +; SSE4-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 +; SSE4-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i1 [[DOTNOT9]], i1 false +; SSE4-NEXT: ret i1 [[TMP2]] ; -; AVX512-LABEL: define zeroext i1 @cmp128( -; AVX512-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { -; AVX512-NEXT: entry: -; AVX512-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer -; AVX512-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 -; AVX512-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 -; AVX512-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer -; AVX512-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 -; AVX512-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 -; AVX512-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i1 [[DOTNOT9]], i1 false -; AVX512-NEXT: ret i1 [[TMP2]] +; AVX-LABEL: define zeroext i1 @cmp128( +; AVX-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX-NEXT: entry: +; AVX-NEXT: [[CMP:%.*]] = icmp ne <2 x i64> [[X]], zeroinitializer +; AVX-NEXT: [[TMP0:%.*]] = bitcast <2 x i1> [[CMP]] to i2 +; AVX-NEXT: [[DOTNOT:%.*]] = icmp eq i2 [[TMP0]], 0 +; 
AVX-NEXT: [[CMP2:%.*]] = icmp ne <2 x i64> [[Y]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i1> [[CMP2]] to i2 +; AVX-NEXT: [[DOTNOT9:%.*]] = icmp eq i2 [[TMP1]], 0 +; AVX-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i1 [[DOTNOT9]], i1 false +; AVX-NEXT: ret i1 [[TMP2]] ; entry: %cmp = icmp ne <2 x i64> %x, zeroinitializer @@ -68,7 +64,7 @@ land.end: define zeroext i1 @cmp256(<4 x i64> %x, <4 x i64> %y) { ; SSE-LABEL: define zeroext i1 @cmp256( -; SSE-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]]) #[[ATTR0]] { +; SSE-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; SSE-NEXT: entry: ; SSE-NEXT: [[CMP:%.*]] = icmp ne <4 x i64> [[X]], zeroinitializer ; SSE-NEXT: [[TMP0:%.*]] = bitcast <4 x i1> [[CMP]] to i4 @@ -83,33 +79,17 @@ define zeroext i1 @cmp256(<4 x i64> %x, <4 x i64> %y) { ; SSE-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] ; SSE-NEXT: ret i1 [[TMP2]] ; -; AVX2-LABEL: define zeroext i1 @cmp256( -; AVX2-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]]) #[[ATTR0]] { -; AVX2-NEXT: entry: -; AVX2-NEXT: [[CMP:%.*]] = icmp ne <4 x i64> [[X]], zeroinitializer -; AVX2-NEXT: [[TMP0:%.*]] = bitcast <4 x i1> [[CMP]] to i4 -; AVX2-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP0]], 0 -; AVX2-NEXT: br i1 [[DOTNOT]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] -; AVX2: land.rhs: -; AVX2-NEXT: [[CMP2:%.*]] = icmp ne <4 x i64> [[Y]], zeroinitializer -; AVX2-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[CMP2]] to i4 -; AVX2-NEXT: [[DOTNOT9:%.*]] = icmp eq i4 [[TMP1]], 0 -; AVX2-NEXT: br label [[LAND_END]] -; AVX2: land.end: -; AVX2-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[DOTNOT9]], [[LAND_RHS]] ] -; AVX2-NEXT: ret i1 [[TMP2]] -; -; AVX512-LABEL: define zeroext i1 @cmp256( -; AVX512-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]]) #[[ATTR0]] { -; AVX512-NEXT: entry: -; AVX512-NEXT: [[CMP:%.*]] = icmp ne <4 x i64> [[X]], zeroinitializer -; AVX512-NEXT: [[TMP0:%.*]] = bitcast <4 x i1> [[CMP]] to i4 -; 
AVX512-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP0]], 0 -; AVX512-NEXT: [[CMP2:%.*]] = icmp ne <4 x i64> [[Y]], zeroinitializer -; AVX512-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[CMP2]] to i4 -; AVX512-NEXT: [[DOTNOT9:%.*]] = icmp eq i4 [[TMP1]], 0 -; AVX512-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i1 [[DOTNOT9]], i1 false -; AVX512-NEXT: ret i1 [[TMP2]] +; AVX-LABEL: define zeroext i1 @cmp256( +; AVX-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]]) #[[ATTR0]] { +; AVX-NEXT: entry: +; AVX-NEXT: [[CMP:%.*]] = icmp ne <4 x i64> [[X]], zeroinitializer +; AVX-NEXT: [[TMP0:%.*]] = bitcast <4 x i1> [[CMP]] to i4 +; AVX-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP0]], 0 +; AVX-NEXT: [[CMP2:%.*]] = icmp ne <4 x i64> [[Y]], zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[CMP2]] to i4 +; AVX-NEXT: [[DOTNOT9:%.*]] = icmp eq i4 [[TMP1]], 0 +; AVX-NEXT: [[TMP2:%.*]] = select i1 [[DOTNOT]], i1 [[DOTNOT9]], i1 false +; AVX-NEXT: ret i1 [[TMP2]] ; entry: %cmp = icmp ne <4 x i64> %x, zeroinitializer @@ -189,7 +169,3 @@ land.end: %2 = phi i1 [ false, %entry ], [ %.not9, %land.rhs ] ret i1 %2 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} -; SSE2: {{.*}} -; SSE4: {{.*}} From 453b1a2fce3c46e866131797f876976032cff384 Mon Sep 17 00:00:00 2001 From: cmtice Date: Wed, 21 Feb 2024 08:26:05 -0800 Subject: [PATCH 117/351] [LLVM][DWARF] Refactor code for generating DWARF V5 .debug_names (#82394) [LLVM][DWARF] Refactor code for generating DWARF v5 .debug_names Refactor the code that uniques the entries and computes the bucket count for the DWARF V5 .debug_names accelerator table. 
--- llvm/include/llvm/BinaryFormat/Dwarf.h | 19 +++++++++++++++++++ llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 14 ++------------ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h index 869352b35e323..44c0030251b37 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.h +++ b/llvm/include/llvm/BinaryFormat/Dwarf.h @@ -613,6 +613,25 @@ enum AcceleratorTable { DW_hash_function_djb = 0u }; +// Uniquify the string hashes and calculate the bucket count for the +// DWARF v5 Accelerator Table. NOTE: This function effectively consumes the +// 'hashes' input parameter. +inline uint32_t getDebugNamesBucketCount(MutableArrayRef hashes, + uint32_t &uniqueHashCount) { + uint32_t BucketCount = 0; + + sort(hashes); + uniqueHashCount = llvm::unique(hashes) - hashes.begin(); + if (uniqueHashCount > 1024) + BucketCount = uniqueHashCount / 4; + else if (uniqueHashCount > 16) + BucketCount = uniqueHashCount / 2; + else + BucketCount = std::max(uniqueHashCount, 1); + + return BucketCount; +} + // Constants for the GNU pubnames/pubtypes extensions supporting gdb index. enum GDBIndexEntryKind { GIEK_NONE, diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index 22d995a9cc3c5..23fc9b2e0410e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -33,22 +33,12 @@ using namespace llvm; void AccelTableBase::computeBucketCount() { // First get the number of unique hashes. 
- std::vector Uniques; + SmallVector Uniques; Uniques.reserve(Entries.size()); for (const auto &E : Entries) Uniques.push_back(E.second.HashValue); - array_pod_sort(Uniques.begin(), Uniques.end()); - std::vector::iterator P = - std::unique(Uniques.begin(), Uniques.end()); - UniqueHashCount = std::distance(Uniques.begin(), P); - - if (UniqueHashCount > 1024) - BucketCount = UniqueHashCount / 4; - else if (UniqueHashCount > 16) - BucketCount = UniqueHashCount / 2; - else - BucketCount = std::max(UniqueHashCount, 1); + BucketCount = llvm::dwarf::getDebugNamesBucketCount(Uniques, UniqueHashCount); } void AccelTableBase::finalize(AsmPrinter *Asm, StringRef Prefix) { From 13b0321e978fd95503d5f5471a0cfdcd439a5936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 21 Feb 2024 17:21:44 +0100 Subject: [PATCH 118/351] [clang][Interp][NFC] Reject unimplemented cast expressions differently Instead of asserting, emit an appropriate diagnostic. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 0b08309e4e6e0..27e0986192165 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -335,7 +335,7 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { return discard(SubExpr); default: - assert(false && "Cast not implemented"); + return this->emitInvalid(CE); } llvm_unreachable("Unhandled clang::CastKind enum"); } From 3f732c4141e95de829a896c38af11473377dbcd6 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 21 Feb 2024 17:28:42 +0100 Subject: [PATCH 119/351] [mlir][Transforms] Fix use-after-free in #82474 (#82504) When a `ModifyOperationRewrite` is committed, the operation may already have been erased, so `OperationName` must be cached in the rewrite object. Note: This will no longer be needed with #81757, which adds a "cleanup" method to `IRRewrite`. 
--- mlir/lib/Transforms/Utils/DialectConversion.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 88709bb261874..4989ddc3ec94f 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -965,14 +965,14 @@ class ModifyOperationRewrite : public OperationRewrite { ModifyOperationRewrite(ConversionPatternRewriterImpl &rewriterImpl, Operation *op) : OperationRewrite(Kind::ModifyOperation, rewriterImpl, op), - loc(op->getLoc()), attrs(op->getAttrDictionary()), + name(op->getName()), loc(op->getLoc()), attrs(op->getAttrDictionary()), operands(op->operand_begin(), op->operand_end()), successors(op->successor_begin(), op->successor_end()) { if (OpaqueProperties prop = op->getPropertiesStorage()) { // Make a copy of the properties. propertiesStorage = operator new(op->getPropertiesStorageSize()); OpaqueProperties propCopy(propertiesStorage); - op->getName().initOpProperties(propCopy, /*init=*/prop); + name.initOpProperties(propCopy, /*init=*/prop); } } @@ -988,7 +988,9 @@ class ModifyOperationRewrite : public OperationRewrite { void commit() override { if (propertiesStorage) { OpaqueProperties propCopy(propertiesStorage); - op->getName().destroyOpProperties(propCopy); + // Note: The operation may have been erased in the mean time, so + // OperationName must be stored in this object. 
+ name.destroyOpProperties(propCopy); operator delete(propertiesStorage); propertiesStorage = nullptr; } @@ -1003,13 +1005,14 @@ class ModifyOperationRewrite : public OperationRewrite { if (propertiesStorage) { OpaqueProperties propCopy(propertiesStorage); op->copyProperties(propCopy); - op->getName().destroyOpProperties(propCopy); + name.destroyOpProperties(propCopy); operator delete(propertiesStorage); propertiesStorage = nullptr; } } private: + OperationName name; LocationAttr loc; DictionaryAttr attrs; SmallVector operands; From 2cd59bdc891ab59a1abfe5205feb45791a530a47 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 21 Feb 2024 18:05:04 +0800 Subject: [PATCH 120/351] [RISCV] Add test case for miscompile in gather -> strided load combine. NFC This shows the issue in #82430, but triggers it via the widening SEW combine rather than a GEP that RISCVGatherScatterLowering doesn't detect. --- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 890707c6337fa..1724b48dd6be9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -15086,5 +15086,52 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ret <32 x i64> %x } +; FIXME: This is a miscompile triggered by the mgather -> +; riscv.masked.strided.load combine. In order for it to trigger we need either a +; strided gather that RISCVGatherScatterLowering doesn't pick up, or a new +; strided gather generated by the widening sew combine. 
+define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { +; RV32V-LABEL: masked_gather_widen_sew_negative_stride: +; RV32V: # %bb.0: +; RV32V-NEXT: addi a0, a0, -128 +; RV32V-NEXT: li a1, -128 +; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32V-NEXT: vlse64.v v8, (a0), a1 +; RV32V-NEXT: ret +; +; RV64V-LABEL: masked_gather_widen_sew_negative_stride: +; RV64V: # %bb.0: +; RV64V-NEXT: addi a0, a0, -128 +; RV64V-NEXT: li a1, -128 +; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64V-NEXT: vlse64.v v8, (a0), a1 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: lui a1, 16392 +; RV32ZVE32F-NEXT: addi a1, a1, 1152 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32ZVE32F-NEXT: vmv.s.x v9, a1 +; RV32ZVE32F-NEXT: vluxei8.v v8, (a0), v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi a1, a0, 128 +; RV64ZVE32F-NEXT: lw a2, 132(a0) +; RV64ZVE32F-NEXT: lw a3, 0(a0) +; RV64ZVE32F-NEXT: lw a0, 4(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vlse32.v v8, (a1), zero +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: ret + %ptrs = getelementptr i32, ptr %base, <4 x i64> + %x = call <4 x i32> @llvm.masked.gather.v4i32.v32p0(<4 x ptr> %ptrs, i32 8, <4 x i1> shufflevector(<4 x i1> insertelement(<4 x i1> poison, i1 true, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer), <4 x i32> poison) + ret <4 x i32> %x +} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; RV64: {{.*}} From 2e29c91b96832504b9008be5e095f7dd640cdea0 Mon Sep 17 00:00:00 2001 From: Mogball Date: Wed, 21 Feb 2024 16:16:18 +0000 Subject: [PATCH 121/351] Revert "[Coro] [async] Disable inlining in async coroutine splitting (#80904)" This reverts commit b1ac052ab07ea091c90c2b7c89445b2bfcfa42ab. This commit breaks coroutine splitting for non-swift calling convention functions. In this example: ```ll ; ModuleID = 'repro.ll' source_filename = "stdlib/test/runtime/test_llcl.mojo" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @0 = internal constant { i32, i32 } { i32 trunc (i64 sub (i64 ptrtoint (ptr @crash to i64), i64 ptrtoint (ptr getelementptr inbounds ({ i32, i32 }, ptr @0, i32 0, i32 1) to i64)) to i32), i32 64 } define dso_local void @af_suspend_fn(ptr %0, i64 %1, ptr %2) #0 { ret void } define dso_local void @crash(ptr %0) #0 { %2 = call token @llvm.coro.id.async(i32 64, i32 8, i32 0, ptr @0) %3 = call ptr @llvm.coro.begin(token %2, ptr null) %4 = getelementptr inbounds { ptr, { ptr, ptr }, i64, { ptr, i1 }, i64, i64 }, ptr poison, i32 0, i32 0 %5 = call ptr @llvm.coro.async.resume() store ptr %5, ptr %4, align 8 %6 = call { ptr, ptr, ptr } (i32, ptr, ptr, ...) @llvm.coro.suspend.async.sl_p0p0p0s(i32 0, ptr %5, ptr @ctxt_proj_fn, ptr @af_suspend_fn, ptr poison, i64 -1, ptr poison) ret void } define dso_local ptr @ctxt_proj_fn(ptr %0) #0 { ret ptr %0 } ; Function Attrs: nomerge nounwind declare { ptr, ptr, ptr } @llvm.coro.suspend.async.sl_p0p0p0s(i32, ptr, ptr, ...) 
#1 ; Function Attrs: nounwind declare token @llvm.coro.id.async(i32, i32, i32, ptr) #2 ; Function Attrs: nounwind declare ptr @llvm.coro.begin(token, ptr writeonly) #2 ; Function Attrs: nomerge nounwind declare ptr @llvm.coro.async.resume() #1 attributes #0 = { "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+clwb,+clzero,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mwaitx,+pclmul,+pku,+popcnt,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+vaes,+vpclmulqdq,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } attributes #1 = { nomerge nounwind } attributes #2 = { nounwind } ``` This verifier crashes after the `coro-split` pass with ``` cannot guarantee tail call due to mismatched parameter counts musttail call void @af_suspend_fn(ptr poison, i64 -1, ptr poison) LLVM ERROR: Broken function PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. Stack dump: 0. 
Program arguments: opt ../../../reduced.ll -O0 #0 0x00007f1d89645c0e __interceptor_backtrace.part.0 /build/gcc-11-XeT9lY/gcc-11-11.4.0/build/x86_64-linux-gnu/libsanitizer/asan/../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:4193:28 #1 0x0000556d94d254f7 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Support/Unix/Signals.inc:723:22 #2 0x0000556d94d19a2f llvm::sys::RunSignalHandlers() /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Support/Signals.cpp:105:20 #3 0x0000556d94d1aa42 SignalHandler(int) /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Support/Unix/Signals.inc:371:36 #4 0x00007f1d88e42520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520) #5 0x00007f1d88e969fc __pthread_kill_implementation ./nptl/pthread_kill.c:44:76 #6 0x00007f1d88e969fc __pthread_kill_internal ./nptl/pthread_kill.c:78:10 #7 0x00007f1d88e969fc pthread_kill ./nptl/pthread_kill.c:89:10 #8 0x00007f1d88e42476 gsignal ./signal/../sysdeps/posix/raise.c:27:6 #9 0x00007f1d88e287f3 abort ./stdlib/abort.c:81:7 #10 0x0000556d8944be01 std::vector>::size() const /usr/include/c++/11/bits/stl_vector.h:919:40 #11 0x0000556d8944be01 bool std::operator==>(std::vector> const&, std::vector> const&) /usr/include/c++/11/bits/stl_vector.h:1893:23 #12 0x0000556d8944be01 llvm::json::operator==(llvm::json::Array const&, llvm::json::Array const&) /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/Support/JSON.h:572:69 #13 0x0000556d8944be01 llvm::json::operator==(llvm::json::Value const&, llvm::json::Value const&) (.cold) /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Support/JSON.cpp:204:28 #14 0x0000556d949ed2bd llvm::report_fatal_error(char const*, bool) /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Support/ErrorHandling.cpp:82:70 #15 0x0000556d8e37e876 llvm::SmallVectorBase::size() const /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/ADT/SmallVector.h:91:32 #16 
0x0000556d8e37e876 llvm::SmallVectorTemplateCommon::end() /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/ADT/SmallVector.h:282:41 #17 0x0000556d8e37e876 llvm::SmallVector::~SmallVector() /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/ADT/SmallVector.h:1215:24 #18 0x0000556d8e37e876 llvm::DiagnosticInfoOptimizationBase::~DiagnosticInfoOptimizationBase() /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/DiagnosticInfo.h:413:7 #19 0x0000556d8e37e876 llvm::DiagnosticInfoIROptimization::~DiagnosticInfoIROptimization() /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/DiagnosticInfo.h:622:7 #20 0x0000556d8e37e876 llvm::OptimizationRemark::~OptimizationRemark() /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/DiagnosticInfo.h:689:7 #21 0x0000556d8e37e876 operator() /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp:2213:14 #22 0x0000556d8e37e876 emit > /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/Analysis/OptimizationRemarkEmitter.h:83:12 #23 0x0000556d8e37e876 llvm::CoroSplitPass::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp:2212:13 #24 0x0000556d8c36ecb1 llvm::detail::PassModel, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:3 #25 0x0000556d91c1a84f llvm::PassManager, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Analysis/CGSCCPassManager.cpp:90:12 #26 0x0000556d8c3690d1 llvm::detail::PassModel, llvm::LazyCallGraph&, 
llvm::CGSCCUpdateResult&>, llvm::AnalysisManager, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:3 #27 0x0000556d91c2162d llvm::ModuleToPostOrderCGSCCPassAdaptor::run(llvm::Module&, llvm::AnalysisManager&) /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Analysis/CGSCCPassManager.cpp:278:18 #28 0x0000556d8c369035 llvm::detail::PassModel>::run(llvm::Module&, llvm::AnalysisManager&) /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:3 #29 0x0000556d9457abc5 llvm::PassManager>::run(llvm::Module&, llvm::AnalysisManager&) /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/PassManager.h:247:20 #30 0x0000556d8e30979e llvm::CoroConditionalWrapper::run(llvm::Module&, llvm::AnalysisManager&) /home/ubuntu/modular/third-party/llvm-project/llvm/lib/Transforms/Coroutines/CoroConditionalWrapper.cpp:19:74 #31 0x0000556d8c365755 llvm::detail::PassModel>::run(llvm::Module&, llvm::AnalysisManager&) /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:3 #32 0x0000556d9457abc5 llvm::PassManager>::run(llvm::Module&, llvm::AnalysisManager&) /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/PassManager.h:247:20 #33 0x0000556d89818556 llvm::SmallPtrSetImplBase::isSmall() const /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/ADT/SmallPtrSet.h:196:33 #34 0x0000556d89818556 llvm::SmallPtrSetImplBase::~SmallPtrSetImplBase() /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/ADT/SmallPtrSet.h:84:17 #35 0x0000556d89818556 llvm::SmallPtrSetImpl::~SmallPtrSetImpl() /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/ADT/SmallPtrSet.h:321:7 #36 0x0000556d89818556 llvm::SmallPtrSet::~SmallPtrSet() 
/home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/ADT/SmallPtrSet.h:427:7 #37 0x0000556d89818556 llvm::PreservedAnalyses::~PreservedAnalyses() /home/ubuntu/modular/third-party/llvm-project/llvm/include/llvm/IR/Analysis.h:109:7 #38 0x0000556d89818556 llvm::runPassPipeline(llvm::StringRef, llvm::Module&, llvm::TargetMachine*, llvm::TargetLibraryInfoImpl*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::ToolOutputFile*, llvm::StringRef, llvm::ArrayRef, llvm::ArrayRef>, llvm::opt_tool::OutputKind, llvm::opt_tool::VerifierKind, bool, bool, bool, bool, bool, bool, bool) /home/ubuntu/modular/third-party/llvm-project/llvm/tools/opt/NewPMDriver.cpp:532:10 #39 0x0000556d897e3939 optMain /home/ubuntu/modular/third-party/llvm-project/llvm/tools/opt/optdriver.cpp:737:27 #40 0x0000556d89455461 main /home/ubuntu/modular/third-party/llvm-project/llvm/tools/opt/opt.cpp:25:33 #41 0x00007f1d88e29d90 __libc_start_call_main ./csu/../sysdeps/nptl/libc_start_call_main.h:58:16 #42 0x00007f1d88e29e40 call_init ./csu/../csu/libc-start.c:128:20 #43 0x00007f1d88e29e40 __libc_start_main ./csu/../csu/libc-start.c:379:5 #44 0x0000556d897b6335 _start (/home/ubuntu/modular/.derived/third-party/llvm-project/build-relwithdebinfo-asan/bin/opt+0x150c335) Aborted (core dumped) --- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 81 +++++---- ...o-async-addr-lifetime-infinite-loop-bug.ll | 8 +- .../coro-async-addr-lifetime-start-bug.ll | 6 +- .../Coroutines/coro-async-dyn-align.ll | 10 +- .../Coroutines/coro-async-mutal-recursive.ll | 158 ------------------ .../Coroutines/coro-async-unreachable.ll | 10 +- llvm/test/Transforms/Coroutines/coro-async.ll | 76 ++++----- .../Transforms/Coroutines/swift-async-dbg.ll | 30 ++-- 8 files changed, 117 insertions(+), 262 deletions(-) delete mode 100644 llvm/test/Transforms/Coroutines/coro-async-mutal-recursive.ll diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 4d0c221b47afa..e6b7c9ae90945 
100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -117,8 +117,8 @@ class CoroCloner { /// Create a cloner for a switch lowering. CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, Kind FKind) - : OrigF(OrigF), NewF(nullptr), Suffix(Suffix), Shape(Shape), - FKind(FKind), Builder(OrigF.getContext()) { + : OrigF(OrigF), NewF(nullptr), Suffix(Suffix), Shape(Shape), FKind(FKind), + Builder(OrigF.getContext()) { assert(Shape.ABI == coro::ABI::Switch); } @@ -170,8 +170,7 @@ class CoroCloner { static void maybeFreeRetconStorage(IRBuilder<> &Builder, const coro::Shape &Shape, Value *FramePtr, CallGraph *CG) { - assert(Shape.ABI == coro::ABI::Retcon || - Shape.ABI == coro::ABI::RetconOnce); + assert(Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce); if (Shape.RetconLowering.IsFrameInlineInStorage) return; @@ -208,12 +207,17 @@ static bool replaceCoroEndAsync(AnyCoroEndInst *End) { // Insert the return instruction. Builder.SetInsertPoint(End); Builder.CreateRetVoid(); + InlineFunctionInfo FnInfo; // Remove the rest of the block, by splitting it into an unreachable block. auto *BB = End->getParent(); BB->splitBasicBlock(End); BB->getTerminator()->eraseFromParent(); + auto InlineRes = InlineFunction(*MustTailCall, FnInfo); + assert(InlineRes.isSuccess() && "Expected inlining to succeed"); + (void)InlineRes; + // We have cleaned up the coro.end block above. 
return false; } @@ -264,7 +268,7 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End, if (auto *RetStructTy = dyn_cast(RetTy)) { assert(RetStructTy->getNumElements() == NumReturns && - "numbers of returns should match resume function singature"); + "numbers of returns should match resume function singature"); Value *ReturnValue = UndefValue::get(RetStructTy); unsigned Idx = 0; for (Value *RetValEl : CoroResults->return_values()) @@ -277,7 +281,8 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End, assert(NumReturns == 1); Builder.CreateRet(*CoroResults->retval_begin()); } - CoroResults->replaceAllUsesWith(ConstantTokenNone::get(CoroResults->getContext())); + CoroResults->replaceAllUsesWith( + ConstantTokenNone::get(CoroResults->getContext())); CoroResults->eraseFromParent(); break; } @@ -291,7 +296,7 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End, auto RetTy = Shape.getResumeFunctionType()->getReturnType(); auto RetStructTy = dyn_cast(RetTy); PointerType *ContinuationTy = - cast(RetStructTy ? RetStructTy->getElementType(0) : RetTy); + cast(RetStructTy ? RetStructTy->getElementType(0) : RetTy); Value *ReturnValue = ConstantPointerNull::get(ContinuationTy); if (RetStructTy) { @@ -480,11 +485,12 @@ void CoroCloner::replaceRetconOrAsyncSuspendUses() { Shape.ABI == coro::ABI::Async); auto NewS = VMap[ActiveSuspend]; - if (NewS->use_empty()) return; + if (NewS->use_empty()) + return; // Copy out all the continuation arguments after the buffer pointer into // an easily-indexed data structure for convenience. - SmallVector Args; + SmallVector Args; // The async ABI includes all arguments -- including the first argument. bool IsAsyncABI = Shape.ABI == coro::ABI::Async; for (auto I = IsAsyncABI ? NewF->arg_begin() : std::next(NewF->arg_begin()), @@ -511,7 +517,8 @@ void CoroCloner::replaceRetconOrAsyncSuspendUses() { } // If we have no remaining uses, we're done. 
- if (NewS->use_empty()) return; + if (NewS->use_empty()) + return; // Otherwise, we need to create an aggregate. Value *Agg = PoisonValue::get(NewS->getType()); @@ -549,7 +556,8 @@ void CoroCloner::replaceCoroSuspends() { for (AnyCoroSuspendInst *CS : Shape.CoroSuspends) { // The active suspend was handled earlier. - if (CS == ActiveSuspend) continue; + if (CS == ActiveSuspend) + continue; auto *MappedCS = cast(VMap[CS]); MappedCS->replaceAllUsesWith(SuspendResult); @@ -707,7 +715,7 @@ void CoroCloner::replaceEntryBlock() { // In switch-lowering, we built a resume-entry block in the original // function. Make the entry block branch to this. auto *SwitchBB = - cast(VMap[Shape.SwitchLowering.ResumeEntryBlock]); + cast(VMap[Shape.SwitchLowering.ResumeEntryBlock]); Builder.CreateBr(SwitchBB); break; } @@ -1055,7 +1063,7 @@ void CoroCloner::create() { // to suppress deallocation code. if (Shape.ABI == coro::ABI::Switch) coro::replaceCoroFree(cast(VMap[Shape.CoroBegin->getId()]), - /*Elide=*/ FKind == CoroCloner::Kind::SwitchCleanup); + /*Elide=*/FKind == CoroCloner::Kind::SwitchCleanup); } static void updateAsyncFuncPointerContextSize(coro::Shape &Shape) { @@ -1842,8 +1850,13 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape, SmallVector Args(Suspend->args()); auto FnArgs = ArrayRef(Args).drop_front( CoroSuspendAsyncInst::MustTailCallFuncArg + 1); - coro::createMustTailCall(Suspend->getDebugLoc(), Fn, TTI, FnArgs, Builder); + auto *TailCall = coro::createMustTailCall(Suspend->getDebugLoc(), Fn, TTI, + FnArgs, Builder); Builder.CreateRetVoid(); + InlineFunctionInfo FnInfo; + auto InlineRes = InlineFunction(*TailCall, FnInfo); + assert(InlineRes.isSuccess() && "Expected inlining to succeed"); + (void)InlineRes; // Replace the lvm.coro.async.resume intrisic call. 
replaceAsyncResumeFunction(Suspend, Continuation); @@ -1860,8 +1873,7 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape, static void splitRetconCoroutine(Function &F, coro::Shape &Shape, SmallVectorImpl &Clones) { - assert(Shape.ABI == coro::ABI::Retcon || - Shape.ABI == coro::ABI::RetconOnce); + assert(Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce); assert(Clones.empty()); // Reset various things that the optimizer might have decided it @@ -1887,7 +1899,7 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, // FIXME: pass the required alignment RawFramePtr = Shape.emitAlloc(Builder, Builder.getInt64(Size), nullptr); RawFramePtr = - Builder.CreateBitCast(RawFramePtr, Shape.CoroBegin->getType()); + Builder.CreateBitCast(RawFramePtr, Shape.CoroBegin->getType()); // Stash the allocated frame pointer in the continuation storage. Builder.CreateStore(RawFramePtr, Id->getStorage()); @@ -1927,8 +1939,8 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, // Create the unified return block. if (!ReturnBB) { // Place it before the first suspend. - ReturnBB = BasicBlock::Create(F.getContext(), "coro.return", &F, - NewSuspendBB); + ReturnBB = + BasicBlock::Create(F.getContext(), "coro.return", &F, NewSuspendBB); Shape.RetconLowering.ReturnBlock = ReturnBB; IRBuilder<> Builder(ReturnBB); @@ -1942,8 +1954,8 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, // Next, all the directly-yielded values. for (auto *ResultTy : Shape.getRetconResultTypes()) - ReturnPHIs.push_back(Builder.CreatePHI(ResultTy, - Shape.CoroSuspends.size())); + ReturnPHIs.push_back( + Builder.CreatePHI(ResultTy, Shape.CoroSuspends.size())); // Build the return value. auto RetTy = F.getReturnType(); @@ -1952,9 +1964,9 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, // We can't rely on the types matching up because that type would // have to be infinite. 
auto CastedContinuationTy = - (ReturnPHIs.size() == 1 ? RetTy : RetTy->getStructElementType(0)); + (ReturnPHIs.size() == 1 ? RetTy : RetTy->getStructElementType(0)); auto *CastedContinuation = - Builder.CreateBitCast(ReturnPHIs[0], CastedContinuationTy); + Builder.CreateBitCast(ReturnPHIs[0], CastedContinuationTy); Value *RetV; if (ReturnPHIs.size() == 1) { @@ -1988,17 +2000,18 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, } namespace { - class PrettyStackTraceFunction : public PrettyStackTraceEntry { - Function &F; - public: - PrettyStackTraceFunction(Function &F) : F(F) {} - void print(raw_ostream &OS) const override { - OS << "While splitting coroutine "; - F.printAsOperand(OS, /*print type*/ false, F.getParent()); - OS << "\n"; - } - }; -} +class PrettyStackTraceFunction : public PrettyStackTraceEntry { + Function &F; + +public: + PrettyStackTraceFunction(Function &F) : F(F) {} + void print(raw_ostream &OS) const override { + OS << "While splitting coroutine "; + F.printAsOperand(OS, /*print type*/ false, F.getParent()); + OS << "\n"; + } +}; +} // namespace static coro::Shape splitCoroutine(Function &F, SmallVectorImpl &Clones, diff --git a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-infinite-loop-bug.ll b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-infinite-loop-bug.ll index 4960709932948..07b3bd8fa94ac 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-infinite-loop-bug.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-infinite-loop-bug.ll @@ -22,8 +22,8 @@ declare void @my_other_async_function(ptr %async.ctxt) i32 128 ; Initial async context size without space for frame }> -define swifttailcc void @my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt) alwaysinline { - tail call swifttailcc void %fnPtr(ptr %async.ctxt) +define swiftcc void @my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt) { + tail call swiftcc void %fnPtr(ptr %async.ctxt) ret void } @@ 
-37,12 +37,12 @@ entry: ; The address of alloca escapes but the analysis based on lifetimes fails to see ; that it can't localize this alloca. -; CHECK: define swifttailcc void @my_async_function(ptr swiftasync %async.ctxt) { +; CHECK: define swiftcc void @my_async_function(ptr swiftasync %async.ctxt) { ; CHECK: entry: ; CHECK-NOT: ret ; CHECK-NOT: [[ESCAPED_ADDR:%.*]] = alloca i64, align 8 ; CHECK: ret -define swifttailcc void @my_async_function(ptr swiftasync %async.ctxt) { +define swiftcc void @my_async_function(ptr swiftasync %async.ctxt) { entry: %escaped_addr = alloca i64 diff --git a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll index 42377285f77ca..2306b72a0055f 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll @@ -22,8 +22,8 @@ declare void @my_other_async_function(ptr %async.ctxt) i32 128 ; Initial async context size without space for frame }> -define swifttailcc void @my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt) alwaysinline { - tail call swifttailcc void %fnPtr(ptr %async.ctxt) +define swiftcc void @my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt) { + tail call swiftcc void %fnPtr(ptr %async.ctxt) ret void } @@ -36,7 +36,7 @@ entry: ret ptr %resume_ctxt } -define swifttailcc void @my_async_function(ptr swiftasync %async.ctxt) { +define swiftcc void @my_async_function(ptr swiftasync %async.ctxt) { entry: %escaped_addr = alloca i64 diff --git a/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll b/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll index 567977ea1476d..040c9881c1ab3 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll @@ -33,12 +33,12 @@ declare swiftcc void @asyncReturn(ptr) declare swiftcc void @asyncSuspend(ptr) declare 
{ptr} @llvm.coro.suspend.async(i32, ptr, ptr, ...) -define swifttailcc void @my_async_function.my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt) alwaysinline { - musttail call swifttailcc void %fnPtr(ptr %async.ctxt) +define swiftcc void @my_async_function.my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt) { + tail call swiftcc void %fnPtr(ptr %async.ctxt) ret void } -define ptr @__swift_async_resume_project_context(ptr %ctxt) alwaysinline { +define ptr @__swift_async_resume_project_context(ptr %ctxt) { entry: %resume_ctxt = load ptr, ptr %ctxt, align 8 ret ptr %resume_ctxt @@ -46,7 +46,7 @@ entry: ; CHECK: %my_async_function.Frame = type { i64, [48 x i8], i64, i64, [16 x i8], ptr, i64, ptr } -; CHECK: define swifttailcc void @my_async_function +; CHECK: define swiftcc void @my_async_function ; CHECK: [[T0:%.*]] = getelementptr inbounds %my_async_function.Frame, ptr %async.ctx.frameptr, i32 0, i32 3 ; CHECK: [[T1:%.*]] = ptrtoint ptr [[T0]] to i64 ; CHECK: [[T2:%.*]] = add i64 [[T1]], 31 @@ -60,7 +60,7 @@ entry: ; CHECK: store i64 2, ptr [[T4]] ; CHECK: store i64 3, ptr [[T9]] -define swifttailcc void @my_async_function(ptr swiftasync %async.ctxt) presplitcoroutine { +define swiftcc void @my_async_function(ptr swiftasync %async.ctxt) presplitcoroutine { entry: %tmp = alloca i64, align 8 %tmp2 = alloca i64, align 16 diff --git a/llvm/test/Transforms/Coroutines/coro-async-mutal-recursive.ll b/llvm/test/Transforms/Coroutines/coro-async-mutal-recursive.ll deleted file mode 100644 index 4931fe998daa6..0000000000000 --- a/llvm/test/Transforms/Coroutines/coro-async-mutal-recursive.ll +++ /dev/null @@ -1,158 +0,0 @@ -; RUN: opt < %s -passes='default' -S | FileCheck --check-prefixes=CHECK %s -; RUN: opt < %s -O0 -S | FileCheck --check-prefixes=CHECK-O0 %s - - -; CHECK-NOT: llvm.coro.suspend.async -; CHECK-O0-NOT: llvm.coro.suspend.async - -; This test used to crash during updating the call graph in coro splitting. 
- -target datalayout = "p:64:64:64" - -%swift.async_func_pointer = type <{ i32, i32 }> - -@"$s1d3fooyySbYaFTu" = hidden global %swift.async_func_pointer <{ i32 trunc (i64 sub (i64 ptrtoint (ptr @"$s1d3fooyySbYaF" to i64), i64 ptrtoint (ptr @"$s1d3fooyySbYaFTu" to i64)) to i32), i32 16 }> -@"$s1d3baryySbYaFTu" = hidden global %swift.async_func_pointer <{ i32 trunc (i64 sub (i64 ptrtoint (ptr @"$s1d3baryySbYaF" to i64), i64 ptrtoint (ptr @"$s1d3baryySbYaFTu" to i64)) to i32), i32 16 }> - -define swifttailcc void @"$s1d3fooyySbYaF"(ptr swiftasync %0, i1 %1) { -entry: - %2 = alloca ptr, align 8 - %c.debug = alloca i1, align 8 - %3 = call token @llvm.coro.id.async(i32 16, i32 16, i32 0, ptr @"$s1d3fooyySbYaFTu") - %4 = call ptr @llvm.coro.begin(token %3, ptr null) - store ptr %0, ptr %2, align 8 - call void @llvm.memset.p0.i64(ptr align 8 %c.debug, i8 0, i64 1, i1 false) - store i1 %1, ptr %c.debug, align 8 - call void asm sideeffect "", "r"(ptr %c.debug) - %5 = load i32, ptr getelementptr inbounds (%swift.async_func_pointer, ptr @"$s1d3baryySbYaFTu", i32 0, i32 1), align 8 - %6 = zext i32 %5 to i64 - %7 = call swiftcc ptr @swift_task_alloc(i64 %6) #4 - call void @llvm.lifetime.start.p0(i64 -1, ptr %7) - %8 = load ptr, ptr %2, align 8 - %9 = getelementptr inbounds <{ ptr, ptr }>, ptr %7, i32 0, i32 0 - store ptr %8, ptr %9, align 8 - %10 = call ptr @llvm.coro.async.resume() - %11 = getelementptr inbounds <{ ptr, ptr }>, ptr %7, i32 0, i32 1 - store ptr %10, ptr %11, align 8 - %12 = call { ptr } (i32, ptr, ptr, ...) 
@llvm.coro.suspend.async.sl_p0s(i32 0, ptr %10, ptr @__swift_async_resume_project_context, ptr @"$s1d3fooyySbYaF.0", ptr @"$s1d3baryySbYaF", ptr %7, i1 %1) - %13 = extractvalue { ptr } %12, 0 - %14 = call ptr @__swift_async_resume_project_context(ptr %13) - store ptr %14, ptr %2, align 8 - call swiftcc void @swift_task_dealloc(ptr %7) #4 - call void @llvm.lifetime.end.p0(i64 -1, ptr %7) - %15 = load ptr, ptr %2, align 8 - %16 = getelementptr inbounds <{ ptr, ptr }>, ptr %15, i32 0, i32 1 - %17 = load ptr, ptr %16, align 8 - %18 = load ptr, ptr %2, align 8 - %19 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %4, i1 false, ptr @"$s1d3fooyySbYaF.0.1", ptr %17, ptr %18) - unreachable -} - -declare token @llvm.coro.id.async(i32, i32, i32, ptr) #1 - -declare void @llvm.trap() #2 - -declare ptr @llvm.coro.begin(token, ptr) #1 - -declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1 immarg) #3 - -define hidden swifttailcc void @"$s1d3baryySbYaF"(ptr swiftasync %0, i1 %1) { -entry: - %2 = alloca ptr, align 8 - %c.debug = alloca i1, align 8 - %3 = call token @llvm.coro.id.async(i32 16, i32 16, i32 0, ptr @"$s1d3baryySbYaFTu") - %4 = call ptr @llvm.coro.begin(token %3, ptr null) - store ptr %0, ptr %2, align 8 - call void @llvm.memset.p0.i64(ptr align 8 %c.debug, i8 0, i64 1, i1 false) - store i1 %1, ptr %c.debug, align 8 - call void asm sideeffect "", "r"(ptr %c.debug) - br i1 %1, label %5, label %17 - -5: ; preds = %entry - %6 = xor i1 %1, true - %7 = load i32, ptr getelementptr inbounds (%swift.async_func_pointer, ptr @"$s1d3fooyySbYaFTu", i32 0, i32 1), align 8 - %8 = zext i32 %7 to i64 - %9 = call swiftcc ptr @swift_task_alloc(i64 %8) #4 - call void @llvm.lifetime.start.p0(i64 -1, ptr %9) - %10 = load ptr, ptr %2, align 8 - %11 = getelementptr inbounds <{ ptr, ptr }>, ptr %9, i32 0, i32 0 - store ptr %10, ptr %11, align 8 - %12 = call ptr @llvm.coro.async.resume() - %13 = getelementptr inbounds <{ ptr, ptr }>, ptr %9, i32 0, i32 1 - store ptr %12, ptr %13, align 
8 - %14 = call { ptr } (i32, ptr, ptr, ...) @llvm.coro.suspend.async.sl_p0s(i32 0, ptr %12, ptr @__swift_async_resume_project_context, ptr @"$s1d3baryySbYaF.0.2", ptr @"$s1d3fooyySbYaF", ptr %9, i1 %6) - %15 = extractvalue { ptr } %14, 0 - %16 = call ptr @__swift_async_resume_project_context(ptr %15) - store ptr %16, ptr %2, align 8 - call swiftcc void @swift_task_dealloc(ptr %9) #4 - call void @llvm.lifetime.end.p0(i64 -1, ptr %9) - br label %18 - -17: ; preds = %entry - br label %18 - -18: ; preds = %5, %17 - %19 = load ptr, ptr %2, align 8 - %20 = getelementptr inbounds <{ ptr, ptr }>, ptr %19, i32 0, i32 1 - %21 = load ptr, ptr %20, align 8 - %22 = load ptr, ptr %2, align 8 - %23 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %4, i1 false, ptr @"$s1d3baryySbYaF.0", ptr %21, ptr %22) - unreachable -} - -declare swiftcc ptr @swift_task_alloc(i64) #4 - -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #5 - -declare ptr @llvm.coro.async.resume() #6 - -define linkonce_odr hidden ptr @__swift_async_resume_project_context(ptr %0) #7 { -entry: - %1 = load ptr, ptr %0, align 8 - %2 = call ptr @llvm.swift.async.context.addr() - store ptr %1, ptr %2, align 8 - ret ptr %1 -} - -declare ptr @llvm.swift.async.context.addr() #1 - -define internal swifttailcc void @"$s1d3fooyySbYaF.0"(ptr %0, ptr %1, i1 %2) #8 { -entry: - musttail call swifttailcc void %0(ptr swiftasync %1, i1 %2) - ret void -} - -declare { ptr } @llvm.coro.suspend.async.sl_p0s(i32, ptr, ptr, ...) #6 - -declare swiftcc void @swift_task_dealloc(ptr) #4 - -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #5 - -define internal swifttailcc void @"$s1d3fooyySbYaF.0.1"(ptr %0, ptr %1) #8 { -entry: - musttail call swifttailcc void %0(ptr swiftasync %1) - ret void -} - -declare i1 @llvm.coro.end.async(ptr, i1, ...) 
#1 - -define internal swifttailcc void @"$s1d3baryySbYaF.0"(ptr %0, ptr %1) #8 { -entry: - musttail call swifttailcc void %0(ptr swiftasync %1) - ret void -} - -define internal swifttailcc void @"$s1d3baryySbYaF.0.2"(ptr %0, ptr %1, i1 %2) #8 { -entry: - musttail call swifttailcc void %0(ptr swiftasync %1, i1 %2) - ret void -} - -attributes #1 = { nounwind } -attributes #2 = { cold noreturn nounwind } -attributes #3 = { nocallback nofree nounwind willreturn} -attributes #4 = { nounwind } -attributes #5 = { nocallback nofree nosync nounwind willreturn } -attributes #6 = { nomerge nounwind } -attributes #7 = { alwaysinline nounwind } -attributes #8 = { alwaysinline nounwind } diff --git a/llvm/test/Transforms/Coroutines/coro-async-unreachable.ll b/llvm/test/Transforms/Coroutines/coro-async-unreachable.ll index ed4f526b8ed98..79ef8939b0ecc 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-unreachable.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-unreachable.ll @@ -13,8 +13,8 @@ target datalayout = "p:64:64:64" declare void @my_other_async_function(ptr %async.ctxt) ; Function that implements the dispatch to the callee function. 
-define swifttailcc void @my_async_function.my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt, ptr %task, ptr %actor) alwaysinline { - musttail call swifttailcc void %fnPtr(ptr %async.ctxt, ptr %task, ptr %actor) +define swiftcc void @my_async_function.my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt, ptr %task, ptr %actor) { + tail call swiftcc void %fnPtr(ptr %async.ctxt, ptr %task, ptr %actor) ret void } @@ -38,7 +38,7 @@ entry: i32 128 ; Initial async context size without space for frame }> -define swifttailcc void @unreachable(ptr %async.ctxt, ptr %task, ptr %actor) { +define swiftcc void @unreachable(ptr %async.ctxt, ptr %task, ptr %actor) { entry: %tmp = alloca { i64, i64 }, align 8 %proj.1 = getelementptr inbounds { i64, i64 }, ptr %tmp, i64 0, i32 0 @@ -77,11 +77,11 @@ entry: unreachable } -; CHECK: define swifttailcc void @unreachable +; CHECK: define swiftcc void @unreachable ; CHECK-NOT: @llvm.coro.suspend.async ; CHECK: return -; CHECK: define internal swifttailcc void @unreachable.resume.0 +; CHECK: define internal swiftcc void @unreachable.resume.0 ; CHECK: unreachable declare ptr @llvm.coro.prepare.async(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-async.ll b/llvm/test/Transforms/Coroutines/coro-async.ll index 8ead304fe2988..3740c3d1d8387 100644 --- a/llvm/test/Transforms/Coroutines/coro-async.ll +++ b/llvm/test/Transforms/Coroutines/coro-async.ll @@ -37,28 +37,28 @@ declare void @my_other_async_function(ptr %async.ctxt) }> ; Function that implements the dispatch to the callee function. 
-define swifttailcc void @my_async_function.my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt, ptr %task, ptr %actor) alwaysinline { - musttail call swifttailcc void %fnPtr(ptr %async.ctxt, ptr %task, ptr %actor) +define swiftcc void @my_async_function.my_other_async_function_fp.apply(ptr %fnPtr, ptr %async.ctxt, ptr %task, ptr %actor) { + tail call swiftcc void %fnPtr(ptr %async.ctxt, ptr %task, ptr %actor) ret void } declare void @some_user(i64) declare void @some_may_write(ptr) -define ptr @__swift_async_resume_project_context(ptr %ctxt) alwaysinline { +define ptr @__swift_async_resume_project_context(ptr %ctxt) { entry: %resume_ctxt = load ptr, ptr %ctxt, align 8 ret ptr %resume_ctxt } -define ptr @resume_context_projection(ptr %ctxt) alwaysinline { +define ptr @resume_context_projection(ptr %ctxt) { entry: %resume_ctxt = load ptr, ptr %ctxt, align 8 ret ptr %resume_ctxt } -define swifttailcc void @my_async_function(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) presplitcoroutine !dbg !1 { +define swiftcc void @my_async_function(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) presplitcoroutine !dbg !1 { entry: %tmp = alloca { i64, i64 }, align 8 %vector = alloca <4 x double>, align 16 @@ -100,7 +100,7 @@ entry: %val.2 = load i64, ptr %proj.2 call void @some_user(i64 %val.2) store <4 x double> %vector_spill, ptr %vector, align 16 - tail call swifttailcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) + tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) call i1 (ptr, i1, ...) 
@llvm.coro.end.async(ptr %hdl, i1 0) unreachable } @@ -116,8 +116,8 @@ define void @my_async_function_pa(ptr %ctxt, ptr %task, ptr %actor) { ; CHECK: @my_async_function_pa_fp = constant <{ i32, i32 }> <{ {{.*}}, i32 176 } ; CHECK: @my_async_function2_fp = constant <{ i32, i32 }> <{ {{.*}}, i32 176 } -; CHECK-LABEL: define swifttailcc void @my_async_function(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) -; CHECK-O0-LABEL: define swifttailcc void @my_async_function(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) +; CHECK-LABEL: define swiftcc void @my_async_function(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) +; CHECK-O0-LABEL: define swiftcc void @my_async_function(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) ; CHECK-SAME: !dbg ![[SP1:[0-9]+]] { ; CHECK: coro.return: ; CHECK: [[FRAMEPTR:%.*]] = getelementptr inbounds i8, ptr %async.ctxt, i64 128 @@ -139,12 +139,12 @@ define void @my_async_function_pa(ptr %ctxt, ptr %task, ptr %actor) { ; CHECK-O0: [[VECTOR_SPILL:%.*]] = load <4 x double>, ptr {{.*}} ; CHECK-O0: [[VECTOR_SPILL_ADDR:%.*]] = getelementptr inbounds %my_async_function.Frame, ptr {{.*}}, i32 0, i32 1 ; CHECK-O0: store <4 x double> [[VECTOR_SPILL]], ptr [[VECTOR_SPILL_ADDR]], align 16 -; CHECK: tail call swifttailcc void @asyncSuspend(ptr nonnull [[CALLEE_CTXT]], ptr %task, ptr %actor) +; CHECK: tail call swiftcc void @asyncSuspend(ptr nonnull [[CALLEE_CTXT]], ptr %task, ptr %actor) ; CHECK: ret void ; CHECK: } -; CHECK-LABEL: define internal swifttailcc void @my_async_functionTQ0_(ptr nocapture readonly swiftasync %0, ptr %1, ptr nocapture readnone %2) -; CHECK-O0-LABEL: define internal swifttailcc void @my_async_functionTQ0_(ptr swiftasync %0, ptr %1, ptr %2) +; CHECK-LABEL: define internal swiftcc void @my_async_functionTQ0_(ptr nocapture readonly swiftasync %0, ptr %1, ptr nocapture readnone %2) +; CHECK-O0-LABEL: define internal swiftcc void @my_async_functionTQ0_(ptr swiftasync %0, ptr %1, ptr %2) ; CHECK-SAME: !dbg 
![[SP2:[0-9]+]] { ; CHECK: entryresume.0: ; CHECK: [[CALLER_CONTEXT:%.*]] = load ptr, ptr %0 @@ -163,7 +163,7 @@ define void @my_async_function_pa(ptr %ctxt, ptr %task, ptr %actor) { ; CHECK: tail call void @some_user(i64 [[VAL1]]) ; CHECK: [[VAL2:%.*]] = load i64, ptr [[ALLOCA_PRJ2]] ; CHECK: tail call void @some_user(i64 [[VAL2]]) -; CHECK: tail call swifttailcc void @asyncReturn(ptr [[ASYNC_CTXT_RELOAD]], ptr %1, ptr [[ACTOR_RELOAD]]) +; CHECK: tail call swiftcc void @asyncReturn(ptr [[ASYNC_CTXT_RELOAD]], ptr %1, ptr [[ACTOR_RELOAD]]) ; CHECK: ret void ; CHECK: } @@ -177,7 +177,7 @@ define void @my_async_function_pa(ptr %ctxt, ptr %task, ptr %actor) { i32 128 ; Initial async context size without space for frame }> -define swifttailcc void @my_async_function2(ptr %task, ptr %actor, ptr %async.ctxt) presplitcoroutine "frame-pointer"="all" !dbg !6 { +define swiftcc void @my_async_function2(ptr %task, ptr %actor, ptr %async.ctxt) presplitcoroutine "frame-pointer"="all" !dbg !6 { entry: %id = call token @llvm.coro.id.async(i32 128, i32 16, i32 2, ptr @my_async_function2_fp) @@ -210,12 +210,12 @@ entry: call void @llvm.coro.async.context.dealloc(ptr %callee_context) %continuation_actor_arg = extractvalue {ptr, ptr, ptr} %res.2, 1 - tail call swifttailcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %continuation_actor_arg) + tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %continuation_actor_arg) call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } -; CHECK-LABEL: define swifttailcc void @my_async_function2(ptr %task, ptr %actor, ptr %async.ctxt) +; CHECK-LABEL: define swiftcc void @my_async_function2(ptr %task, ptr %actor, ptr %async.ctxt) ; CHECK-SAME: #[[FRAMEPOINTER:[0-9]+]] ; CHECK-SAME: !dbg ![[SP3:[0-9]+]] ; CHECK: store ptr %async.ctxt, @@ -225,22 +225,22 @@ entry: ; CHECK: store ptr [[CALLEE_CTXT]], ; CHECK: store ptr @my_async_function2.resume.0, ; CHECK: store ptr %async.ctxt, -; CHECK: 
tail call swifttailcc void @asyncSuspend(ptr nonnull [[CALLEE_CTXT]], ptr %task, ptr %actor) +; CHECK: tail call swiftcc void @asyncSuspend(ptr nonnull [[CALLEE_CTXT]], ptr %task, ptr %actor) ; CHECK: ret void -; CHECK-LABEL: define internal swifttailcc void @my_async_function2.resume.0(ptr %0, ptr nocapture readnone %1, ptr nocapture readonly %2) +; CHECK-LABEL: define internal swiftcc void @my_async_function2.resume.0(ptr %0, ptr nocapture readnone %1, ptr nocapture readonly %2) ; CHECK-SAME: #[[FRAMEPOINTER]] ; CHECK-SAME: !dbg ![[SP4:[0-9]+]] ; CHECK: [[CALLEE_CTXT:%.*]] = load ptr, ptr %2 ; CHECK: [[CALLEE_CTXT_SPILL_ADDR:%.*]] = getelementptr inbounds i8, ptr [[CALLEE_CTXT]], i64 152 ; CHECK: store ptr @my_async_function2.resume.1, ; CHECK: [[CALLLE_CTXT_RELOAD:%.*]] = load ptr, ptr [[CALLEE_CTXT_SPILL_ADDR]] -; CHECK: tail call swifttailcc void @asyncSuspend(ptr [[CALLEE_CTXT_RELOAD]] +; CHECK: tail call swiftcc void @asyncSuspend(ptr [[CALLEE_CTXT_RELOAD]] ; CHECK: ret void -; CHECK-LABEL: define internal swifttailcc void @my_async_function2.resume.1(ptr nocapture readonly %0, ptr %1, ptr nocapture readnone %2) +; CHECK-LABEL: define internal swiftcc void @my_async_function2.resume.1(ptr nocapture readonly %0, ptr %1, ptr nocapture readnone %2) ; CHECK-SAME: #[[FRAMEPOINTER]] -; CHECK: tail call swifttailcc void @asyncReturn({{.*}}%1) +; CHECK: tail call swiftcc void @asyncReturn({{.*}}%1) ; CHECK: ret void define swiftcc void @top_level_caller(ptr %ctxt, ptr %task, ptr %actor) { @@ -252,7 +252,7 @@ define swiftcc void @top_level_caller(ptr %ctxt, ptr %task, ptr %actor) { ; CHECK-LABEL: define swiftcc void @top_level_caller(ptr %ctxt, ptr %task, ptr %actor) ; CHECK: store ptr @my_async_functionTQ0_ ; CHECK: store ptr %ctxt -; CHECK: tail call swifttailcc void @asyncSuspend +; CHECK: tail call swiftcc void @asyncSuspend ; CHECK: ret void @dont_crash_on_cf_fp = constant <{ i32, i32 }> @@ -266,7 +266,7 @@ define swiftcc void @top_level_caller(ptr %ctxt, ptr 
%task, ptr %actor) { }> -define swifttailcc void @dont_crash_on_cf_dispatch(ptr %fnPtr, ptr %async.ctxt, ptr %task, ptr %actor) alwaysinline { +define swiftcc void @dont_crash_on_cf_dispatch(ptr %fnPtr, ptr %async.ctxt, ptr %task, ptr %actor) { %isNull = icmp eq ptr %task, null br i1 %isNull, label %is_null, label %is_not_null @@ -274,11 +274,11 @@ is_null: ret void is_not_null: - musttail call swifttailcc void %fnPtr(ptr %async.ctxt, ptr %task, ptr %actor) + tail call swiftcc void %fnPtr(ptr %async.ctxt, ptr %task, ptr %actor) ret void } -define swifttailcc void @dont_crash_on_cf(ptr %async.ctxt, ptr %task, ptr %actor) presplitcoroutine { +define swiftcc void @dont_crash_on_cf(ptr %async.ctxt, ptr %task, ptr %actor) presplitcoroutine { entry: %id = call token @llvm.coro.id.async(i32 128, i32 16, i32 0, ptr @dont_crash_on_cf_fp) @@ -296,7 +296,7 @@ entry: call void @llvm.coro.async.context.dealloc(ptr %callee_context) %continuation_task_arg = extractvalue {ptr, ptr, ptr} %res, 1 - tail call swifttailcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) + tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) call i1 (ptr, i1, ...) 
@llvm.coro.end.async(ptr %hdl, i1 0) unreachable } @@ -311,12 +311,12 @@ entry: i32 128 ; Initial async context size without space for frame }> -define swifttailcc void @must_tail_call_return(ptr %async.ctxt, ptr %task, ptr %actor) alwaysinline { - musttail call swifttailcc void @asyncReturn(ptr %async.ctxt, ptr %task, ptr %actor) +define swiftcc void @must_tail_call_return(ptr %async.ctxt, ptr %task, ptr %actor) { + musttail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %task, ptr %actor) ret void } -define swifttailcc void @multiple_coro_end_async(ptr %async.ctxt, ptr %task, ptr %actor) presplitcoroutine { +define swiftcc void @multiple_coro_end_async(ptr %async.ctxt, ptr %task, ptr %actor) presplitcoroutine { entry: %id = call token @llvm.coro.id.async(i32 128, i32 16, i32 0, ptr @dont_crash_on_cf_fp) @@ -350,8 +350,8 @@ is_not_equal: unreachable } -; CHECK-LABEL: define internal swifttailcc void @multiple_coro_end_async.resume.0( -; CHECK: musttail call swifttailcc void @asyncReturn( +; CHECK-LABEL: define internal swiftcc void @multiple_coro_end_async.resume.0( +; CHECK: musttail call swiftcc void @asyncReturn( ; CHECK: ret void @polymorphic_suspend_return_fp = constant <{ i32, i32 }> @@ -364,7 +364,7 @@ is_not_equal: i32 64 ; Initial async context size without space for frame }> -define swifttailcc void @polymorphic_suspend_return(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) presplitcoroutine { +define swiftcc void @polymorphic_suspend_return(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) presplitcoroutine { entry: %tmp = alloca { i64, i64 }, align 8 %proj.1 = getelementptr inbounds { i64, i64 }, ptr %tmp, i64 0, i32 0 @@ -405,13 +405,13 @@ entry: %val.2 = load i64, ptr %proj.2 call void @some_user(i64 %val.2) - tail call swifttailcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) + tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) call i1 (ptr, i1, ...) 
@llvm.coro.end.async(ptr %hdl, i1 0) unreachable } -; CHECK-LABEL: define swifttailcc void @polymorphic_suspend_return(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) -; CHECK-LABEL: define internal swifttailcc void @polymorphic_suspend_return.resume.0(ptr {{.*}}swiftasync{{.*}} %0, ptr {{.*}}swiftself{{.*}} %1, ptr {{.*}}%2, ptr {{.*}}%3) +; CHECK-LABEL: define swiftcc void @polymorphic_suspend_return(ptr swiftasync %async.ctxt, ptr %task, ptr %actor) +; CHECK-LABEL: define internal swiftcc void @polymorphic_suspend_return.resume.0(ptr {{.*}}swiftasync{{.*}} %0, ptr {{.*}}swiftself{{.*}} %1, ptr {{.*}}%2, ptr {{.*}}%3) ; CHECK: } @no_coro_suspend_fp = constant <{ i32, i32 }> @@ -481,7 +481,7 @@ entry: declare void @crash() declare void @use(ptr) -define swifttailcc void @undefined_coro_async_resume(ptr %async.ctx) presplitcoroutine { +define swiftcc void @undefined_coro_async_resume(ptr %async.ctx) presplitcoroutine { entry: %id = call token @llvm.coro.id.async(i32 24, i32 16, i32 0, ptr @undefined_coro_async_resume_fp) %hdl = call ptr @llvm.coro.begin(token %id, ptr null) @@ -491,7 +491,7 @@ entry: %unused = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 false) unreachable } -; CHECK-LABEL: define swifttailcc void @undefined_coro_async_resume +; CHECK-LABEL: define swiftcc void @undefined_coro_async_resume ; CHECK-NOT: @llvm.coro.async.resume ; CHECK: call void @use(ptr null) ; CHECK: ret @@ -505,8 +505,8 @@ declare i1 @llvm.coro.end(ptr, i1, token) declare {ptr, ptr, ptr} @llvm.coro.suspend.async(i32, ptr, ptr, ...) 
declare ptr @llvm.coro.async.context.alloc(ptr, ptr) declare void @llvm.coro.async.context.dealloc(ptr) -declare swifttailcc void @asyncReturn(ptr, ptr, ptr) -declare swifttailcc void @asyncSuspend(ptr, ptr, ptr) +declare swiftcc void @asyncReturn(ptr, ptr, ptr) +declare swiftcc void @asyncSuspend(ptr, ptr, ptr) declare ptr @llvm.coro.async.resume() declare void @llvm.coro.async.size.replace(ptr, ptr) declare ptr @hide(ptr) diff --git a/llvm/test/Transforms/Coroutines/swift-async-dbg.ll b/llvm/test/Transforms/Coroutines/swift-async-dbg.ll index a78bcdf0ddee2..74edf7a3f3a54 100644 --- a/llvm/test/Transforms/Coroutines/swift-async-dbg.ll +++ b/llvm/test/Transforms/Coroutines/swift-async-dbg.ll @@ -1,13 +1,13 @@ -; RUN: opt -mtriple='arm64-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg),always-inline' -o - | FileCheck %s -; RUN: opt -mtriple='x86_64' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg),always-inline' -o - | FileCheck %s -; RUN: opt -mtriple='i386-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg),always-inline' -o - | FileCheck %s --check-prefix=NOENTRY -; RUN: opt -mtriple='armv7-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg),always-inline' -o - | FileCheck %s --check-prefix=NOENTRY +; RUN: opt -mtriple='arm64-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s +; RUN: opt -mtriple='x86_64' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s +; RUN: opt -mtriple='i386-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s --check-prefix=NOENTRY +; RUN: opt -mtriple='armv7-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s --check-prefix=NOENTRY ;; Replicate those tests with non-instruction debug markers. 
-; RUN: opt --try-experimental-debuginfo-iterators -mtriple='arm64-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg),always-inline' -o - | FileCheck %s -; RUN: opt --try-experimental-debuginfo-iterators -mtriple='x86_64' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg),always-inline' -o - | FileCheck %s -; RUN: opt --try-experimental-debuginfo-iterators -mtriple='i386-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg),always-inline' -o - | FileCheck %s --check-prefix=NOENTRY -; RUN: opt --try-experimental-debuginfo-iterators -mtriple='armv7-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg),always-inline' -o - | FileCheck %s --check-prefix=NOENTRY +; RUN: opt --try-experimental-debuginfo-iterators -mtriple='arm64-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -mtriple='x86_64' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -mtriple='i386-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s --check-prefix=NOENTRY +; RUN: opt --try-experimental-debuginfo-iterators -mtriple='armv7-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s --check-prefix=NOENTRY ; NOENTRY-NOT: OP_llvm_entry_value @@ -93,29 +93,29 @@ define swifttailcc void @coroutineA(ptr swiftasync %arg) !dbg !48 { @coroutineBTu = global <{i32, i32}> <{ i32 trunc (i64 sub (i64 ptrtoint (ptr @"coroutineB" to i64), i64 ptrtoint (ptr @"coroutineBTu" to i64)) to i32), i32 16 }>, align 8 @coroutineATu = global <{i32, i32}> <{ i32 trunc (i64 sub (i64 ptrtoint (ptr @"coroutineA" to i64), i64 ptrtoint (ptr @"coroutineATu" to i64)) to i32), i32 16 }>, align 8 -define weak_odr hidden ptr @__swift_async_resume_get_context(ptr %arg) alwaysinline !dbg !64 { +define weak_odr hidden ptr 
@__swift_async_resume_get_context(ptr %arg) !dbg !64 { ret ptr %arg, !dbg !65 } -define hidden swifttailcc void @coroutineA.1(ptr %arg, i64 %arg1, i64 %arg2, ptr %arg3) alwaysinline !dbg !66 { +define hidden swifttailcc void @coroutineA.1(ptr %arg, i64 %arg1, i64 %arg2, ptr %arg3) !dbg !66 { musttail call swifttailcc void @swift_task_switch(ptr swiftasync %arg3, ptr %arg, i64 %arg1, i64 %arg2), !dbg !67 ret void, !dbg !67 } -define weak_odr hidden ptr @__swift_async_resume_project_context(ptr %arg) alwaysinline !dbg !68 { +define weak_odr hidden ptr @__swift_async_resume_project_context(ptr %arg) !dbg !68 { %i1 = load ptr, ptr %arg, align 8, !dbg !69 %i2 = call ptr @llvm.swift.async.context.addr(), !dbg !69 store ptr %i1, ptr %i2, align 8, !dbg !69 ret ptr %i1, !dbg !69 } -define hidden swifttailcc void @coroutineA.0(ptr %arg, ptr %arg1) alwaysinline !dbg !70 { +define hidden swifttailcc void @coroutineA.0(ptr %arg, ptr %arg1) !dbg !70 { musttail call swifttailcc void %arg(ptr swiftasync %arg1), !dbg !71 ret void, !dbg !71 } -define hidden swifttailcc void @coroutineA.0.1(ptr %arg, ptr %arg1) alwaysinline !dbg !72 { +define hidden swifttailcc void @coroutineA.0.1(ptr %arg, ptr %arg1) !dbg !72 { musttail call swifttailcc void %arg(ptr swiftasync %arg1), !dbg !73 ret void, !dbg !73 } -define swifttailcc void @coroutineB(ptr swiftasync %arg) alwaysinline !dbg !37 { +define swifttailcc void @coroutineB(ptr swiftasync %arg) !dbg !37 { %i2 = call token @llvm.coro.id.async(i32 16, i32 16, i32 0, ptr nonnull @coroutineBTu) %i3 = call ptr @llvm.coro.begin(token %i2, ptr null) %i6 = getelementptr inbounds <{ ptr, ptr }>, ptr %arg, i64 0, i32 1, !dbg !42 @@ -123,7 +123,7 @@ define swifttailcc void @coroutineB(ptr swiftasync %arg) alwaysinline !dbg !37 { %i10 = call i1 (ptr, i1, ...) 
@llvm.coro.end.async(ptr %i3, i1 false, ptr nonnull @coroutineB.0, ptr %i712, ptr %arg), !dbg !42 unreachable, !dbg !42 } -define hidden swifttailcc void @coroutineB.0(ptr %arg, ptr %arg1) alwaysinline !dbg !44 { +define hidden swifttailcc void @coroutineB.0(ptr %arg, ptr %arg1) !dbg !44 { musttail call swifttailcc void %arg(ptr swiftasync %arg1), !dbg !47 ret void, !dbg !47 } From 162fa4dd25d631d0ab7816ec6081bcaff951a23c Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Wed, 21 Feb 2024 16:40:22 +0000 Subject: [PATCH 122/351] Module::getOrInsertFunction: set debug info format (#82505) Function::Function's constructor sets the debug info format based on the passed in parent Module, so by using this rather than modifying the function list directly, we pick up the debug info format automatically. --- llvm/lib/IR/Module.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index eeb90a6cb3c46..1946db2ee0be7 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -149,10 +149,9 @@ FunctionCallee Module::getOrInsertFunction(StringRef Name, FunctionType *Ty, if (!F) { // Nope, add it Function *New = Function::Create(Ty, GlobalVariable::ExternalLinkage, - DL.getProgramAddressSpace(), Name); + DL.getProgramAddressSpace(), Name, this); if (!New->isIntrinsic()) // Intrinsics get attrs set on construction New->setAttributes(AttributeList); - FunctionList.push_back(New); return {Ty, New}; // Return the new prototype. 
} From 71441ed1716e6ed3f053dea9c1ceb9cfe2822aea Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Wed, 21 Feb 2024 09:22:48 -0800 Subject: [PATCH 123/351] [mlir][Vector] Add vector bitwidth target to xfer op flattening (#81966) This PR adds an optional bitwidth parameter to the vector xfer op flattening transformation so that the flattening doesn't happen if the trailing dimension of the read/writen vector is larger than this bitwidth (i.e., we are already able to fill at least one vector register with that size). --- .../Vector/Transforms/VectorRewritePatterns.h | 9 ++- .../Transforms/VectorTransferOpTransforms.cpp | 45 +++++++++-- .../Vector/vector-transfer-flatten.mlir | 80 +++++++++++++++++-- .../Dialect/Vector/TestVectorTransforms.cpp | 16 +++- 4 files changed, 137 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h index 7c943f07066c7..46bb3ddec0baf 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h @@ -330,8 +330,13 @@ void populateDropUnitDimWithShapeCastPatterns(RewritePatternSet &patterns, /// These patterns insert memref.collapse_shape + vector.shape_cast patterns /// to transform multiple small n-D transfers into a larger 1-D transfer where /// the memref contiguity properties allow it. -void populateFlattenVectorTransferPatterns(RewritePatternSet &patterns, - PatternBenefit benefit = 1); +/// +/// Flattening is only applied if the bitwidth of the trailing vector dimension +/// is smaller or equal to `targetVectorBitwidth`. +void populateFlattenVectorTransferPatterns( + RewritePatternSet &patterns, + unsigned targetVectorBitwidth = std::numeric_limits::max(), + PatternBenefit benefit = 1); /// Collect a set of patterns that bubble up/down bitcast ops. 
/// diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp index b761d1ed88897..04e5a816dd91e 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -19,7 +19,6 @@ #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" -#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dominance.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/STLExtras.h" @@ -535,9 +534,17 @@ namespace { /// memref.collapse_shape on the source so that the resulting /// vector.transfer_read has a 1D source. Requires the source shape to be /// already reduced i.e. without unit dims. +/// If `targetVectorBitwidth` is provided, the flattening will only happen if +/// the trailing dimension of the vector read is smaller than the provided +/// bitwidth. class FlattenContiguousRowMajorTransferReadPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +public: + FlattenContiguousRowMajorTransferReadPattern(MLIRContext *context, + unsigned vectorBitwidth, + PatternBenefit benefit) + : OpRewritePattern(context, benefit), + targetVectorBitwidth(vectorBitwidth) {} LogicalResult matchAndRewrite(vector::TransferReadOp transferReadOp, PatternRewriter &rewriter) const override { @@ -554,6 +561,12 @@ class FlattenContiguousRowMajorTransferReadPattern // If this is already 0D/1D, there's nothing to do. 
if (vectorType.getRank() <= 1) return failure(); + if (!vectorType.getElementType().isSignlessIntOrFloat()) + return failure(); + unsigned trailingVectorDimBitwidth = + vectorType.getShape().back() * vectorType.getElementTypeBitWidth(); + if (trailingVectorDimBitwidth >= targetVectorBitwidth) + return failure(); if (!vector::isContiguousSlice(sourceType, vectorType)) return failure(); // TODO: generalize this pattern, relax the requirements here. @@ -642,6 +655,11 @@ class FlattenContiguousRowMajorTransferReadPattern transferReadOp, cast(vector.getType()), flatRead); return success(); } + +private: + // Minimum bitwidth that the trailing vector dimension should have after + // flattening. + unsigned targetVectorBitwidth; }; /// Rewrites contiguous row-major vector.transfer_write ops by inserting @@ -650,7 +668,12 @@ class FlattenContiguousRowMajorTransferReadPattern /// already reduced i.e. without unit dims. class FlattenContiguousRowMajorTransferWritePattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +public: + FlattenContiguousRowMajorTransferWritePattern(MLIRContext *context, + unsigned vectorBitwidth, + PatternBenefit benefit) + : OpRewritePattern(context, benefit), + targetVectorBitwidth(vectorBitwidth) {} LogicalResult matchAndRewrite(vector::TransferWriteOp transferWriteOp, PatternRewriter &rewriter) const override { @@ -665,6 +688,12 @@ class FlattenContiguousRowMajorTransferWritePattern if (vectorType.getRank() <= 1) // Already 0D/1D, nothing to do. 
return failure(); + if (!vectorType.getElementType().isSignlessIntOrFloat()) + return failure(); + unsigned trailingVectorDimBitwidth = + vectorType.getShape().back() * vectorType.getElementTypeBitWidth(); + if (trailingVectorDimBitwidth >= targetVectorBitwidth) + return failure(); if (!vector::isContiguousSlice(sourceType, vectorType)) return failure(); int64_t firstContiguousInnerDim = @@ -702,6 +731,11 @@ class FlattenContiguousRowMajorTransferWritePattern rewriter.eraseOp(transferWriteOp); return success(); } + +private: + // Minimum bitwidth that the trailing vector dimension should have after + // flattening. + unsigned targetVectorBitwidth; }; /// Base class for `vector.extract/vector.extract_element(vector.transfer_read)` @@ -917,10 +951,11 @@ void mlir::vector::populateVectorTransferDropUnitDimsPatterns( } void mlir::vector::populateFlattenVectorTransferPatterns( - RewritePatternSet &patterns, PatternBenefit benefit) { + RewritePatternSet &patterns, unsigned targetVectorBitwidth, + PatternBenefit benefit) { patterns.add( - patterns.getContext(), benefit); + patterns.getContext(), targetVectorBitwidth, benefit); populateShapeCastFoldingPatterns(patterns, benefit); populateDropUnitDimWithShapeCastPatterns(patterns, benefit); } diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 9976048a3320b..1775b5fa4a346 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -test-vector-transfer-flatten-patterns -split-input-file | FileCheck %s +// RUN: mlir-opt %s -test-vector-transfer-flatten-patterns=target-vector-bitwidth=128 -split-input-file | FileCheck %s --check-prefix=CHECK-128B func.func @transfer_read_dims_match_contiguous( %arg : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<5x4x3x2xi8> { @@ -16,6 +17,9 @@ func.func @transfer_read_dims_match_contiguous( // 
CHECK: %[[VEC2D:.+]] = vector.shape_cast %[[READ1D]] : vector<120xi8> to vector<5x4x3x2xi8> // CHECK: return %[[VEC2D]] +// CHECK-128B-LABEL: func @transfer_read_dims_match_contiguous +// CHECK-128B: memref.collapse_shape + // ----- func.func @transfer_read_dims_match_contiguous_empty_stride( @@ -27,13 +31,16 @@ func.func @transfer_read_dims_match_contiguous_empty_stride( return %v : vector<5x4x3x2xi8> } -// CHECK-LABEL: func @transfer_read_dims_match_contiguous_empty_stride +// CHECK-LABEL: func @transfer_read_dims_match_contiguous_empty_stride( // CHECK-SAME: %[[ARG:[0-9a-zA-Z]+]]: memref<5x4x3x2xi8 // CHECK: %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG]] {{.}}[0, 1, 2, 3] // CHECK: %[[READ1D:.+]] = vector.transfer_read %[[COLLAPSED]] // CHECK: %[[VEC2D:.+]] = vector.shape_cast %[[READ1D]] : vector<120xi8> to vector<5x4x3x2xi8> // CHECK: return %[[VEC2D]] +// CHECK-128B-LABEL: func @transfer_read_dims_match_contiguous_empty_stride( +// CHECK-128B: memref.collapse_shape + // ----- // The shape of the memref and the vector don't match, but the vector is a @@ -57,6 +64,9 @@ func.func @transfer_read_dims_mismatch_contiguous( // CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<4xi8> to vector<1x1x2x2xi8> // CHECK: return %[[VAL_5]] : vector<1x1x2x2xi8> +// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_contiguous( +// CHECK-128B: memref.collapse_shape + // ----- func.func @transfer_read_dims_mismatch_non_zero_indices( @@ -66,7 +76,7 @@ func.func @transfer_read_dims_mismatch_non_zero_indices( %m_out: memref<1x2x6xi32>) { %c0 = arith.constant 0 : index %c0_i32 = arith.constant 0 : i32 - %2 = vector.transfer_read %m_in[%c0, %idx_1, %idx_2, %c0], %c0_i32 {in_bounds = [true, true, true]} : + %2 = vector.transfer_read %m_in[%c0, %idx_1, %idx_2, %c0], %c0_i32 {in_bounds = [true, true, true]} : memref<1x43x4x6xi32>, vector<1x2x6xi32> vector.transfer_write %2, %m_out[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x6xi32>, memref<1x2x6xi32> @@ 
-87,6 +97,9 @@ func.func @transfer_read_dims_mismatch_non_zero_indices( // CHECK: %[[COLLAPSED_OUT:.*]] = memref.collapse_shape %[[M_OUT]] {{\[}}[0, 1, 2]] : memref<1x2x6xi32> into memref<12xi32> // CHECK: vector.transfer_write %[[READ]], %[[COLLAPSED_OUT]][%[[C_0_IDX]]] {in_bounds = [true]} : vector<12xi32>, memref<12xi32> +// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_zero_indices( +// CHECK-128B-NOT: memref.collapse_shape + // ----- // The input memref has a dynamic trailing shape and hence is not flattened. @@ -99,7 +112,7 @@ func.func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes( %m_out: memref<1x2x6xi32>) { %c0 = arith.constant 0 : index %c0_i32 = arith.constant 0 : i32 - %2 = vector.transfer_read %m_in[%c0, %idx_1, %idx_2, %c0], %c0_i32 {in_bounds = [true, true, true]} : + %2 = vector.transfer_read %m_in[%c0, %idx_1, %idx_2, %c0], %c0_i32 {in_bounds = [true, true, true]} : memref<1x?x4x6xi32>, vector<1x2x6xi32> vector.transfer_write %2, %m_out[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x6xi32>, memref<1x2x6xi32> @@ -115,6 +128,9 @@ func.func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes( // CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<1x2x6xi32> to vector<12xi32> // CHECK: vector.transfer_write %[[SC]], %[[COLLAPSED]]{{.*}} : vector<12xi32>, memref<12xi32> +// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_zero_indices_dynamic_shapes( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_read_dims_mismatch_non_contiguous( @@ -130,6 +146,9 @@ func.func @transfer_read_dims_mismatch_non_contiguous( // CHECK-NOT: memref.collapse_shape // CHECK-NOT: vector.shape_cast +// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_contiguous( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_read_dims_mismatch_non_contiguous_empty_stride( @@ -141,10 +160,13 @@ func.func @transfer_read_dims_mismatch_non_contiguous_empty_stride( return %v : 
vector<2x1x2x2xi8> } -// CHECK-LABEL: func.func @transfer_read_dims_mismatch_non_contiguous_empty_stride +// CHECK-LABEL: func.func @transfer_read_dims_mismatch_non_contiguous_empty_stride( // CHECK-NOT: memref.collapse_shape // CHECK-NOT: vector.shape_cast +// CHECK-128B-LABEL: func @transfer_read_dims_mismatch_non_contiguous_empty_stride( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_write_dims_match_contiguous( @@ -155,13 +177,16 @@ func.func @transfer_write_dims_match_contiguous( return } -// CHECK-LABEL: func @transfer_write_dims_match_contiguous +// CHECK-LABEL: func @transfer_write_dims_match_contiguous( // CHECK-SAME: %[[ARG:[0-9a-zA-Z]+]]: memref<5x4x3x2xi8 // CHECK-SAME: %[[VEC:[0-9a-zA-Z]+]]: vector<5x4x3x2xi8> // CHECK-DAG: %[[COLLAPSED:.+]] = memref.collapse_shape %[[ARG]] {{.}}[0, 1, 2, 3]{{.}} : memref<5x4x3x2xi8, {{.+}}> into memref<120xi8, {{.+}}> // CHECK-DAG: %[[VEC1D:.+]] = vector.shape_cast %[[VEC]] : vector<5x4x3x2xi8> to vector<120xi8> // CHECK: vector.transfer_write %[[VEC1D]], %[[COLLAPSED]] +// CHECK-128B-LABEL: func @transfer_write_dims_match_contiguous( +// CHECK-128B: memref.collapse_shape + // ----- func.func @transfer_write_dims_mismatch_contiguous( @@ -182,6 +207,9 @@ func.func @transfer_write_dims_mismatch_contiguous( // CHECK: return // CHECK: } +// CHECK-128B-LABEL: func @transfer_write_dims_mismatch_contiguous( +// CHECK-128B: memref.collapse_shape + // ----- func.func @transfer_write_dims_mismatch_non_contiguous( @@ -196,6 +224,9 @@ func.func @transfer_write_dims_mismatch_non_contiguous( // CHECK-NOT: memref.collapse_shape // CHECK-NOT: vector.shape_cast +// CHECK-128B-LABEL: func @transfer_write_dims_mismatch_non_contiguous( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_write_0d(%arg : memref, %vec : vector) { @@ -207,6 +238,10 @@ func.func @transfer_write_0d(%arg : memref, %vec : vector) { // CHECK-NOT: memref.collapse_shape // CHECK-NOT: vector.shape_cast +// 
CHECK-128B-LABEL: func @transfer_write_0d( +// CHECK-128B-NOT: memref.collapse_shape +// CHECK-128B-NOT: vector.shape_cast + // ----- func.func @transfer_read_0d(%arg : memref) -> vector { @@ -219,6 +254,10 @@ func.func @transfer_read_0d(%arg : memref) -> vector { // CHECK-NOT: memref.collapse_shape // CHECK-NOT: vector.shape_cast +// CHECK-128B-LABEL: func @transfer_read_0d( +// CHECK-128B-NOT: memref.collapse_shape +// CHECK-128B-NOT: vector.shape_cast + // ----- func.func @transfer_read_flattenable_with_dynamic_dims_and_indices(%arg0 : memref>, %arg1 : index, %arg2 : index) -> vector<8x4xi8> { @@ -241,6 +280,9 @@ func.func @transfer_read_flattenable_with_dynamic_dims_and_indices(%arg0 : memre // CHECK: %[[VEC2D:.+]] = vector.shape_cast %[[VEC1D]] : vector<32xi8> to vector<8x4xi8> // CHECK: return %[[VEC2D]] : vector<8x4xi8> +// CHECK-128B-LABEL: func @transfer_read_flattenable_with_dynamic_dims_and_indices( +// CHECK-128B: memref.collapse_shape + // ----- func.func @transfer_write_flattenable_with_dynamic_dims_and_indices(%vec : vector<8x4xi8>, %dst : memref>, %arg1 : index, %arg2 : index) { @@ -260,6 +302,9 @@ func.func @transfer_write_flattenable_with_dynamic_dims_and_indices(%vec : vecto // CHECK-SAME: {in_bounds = [true]} // CHECK-SAME: : vector<32xi8>, memref +// CHECK-128B-LABEL: func @transfer_write_flattenable_with_dynamic_dims_and_indices( +// CHECK-128B: memref.collapse_shape + // ----- func.func @transfer_read_flattenable_negative( @@ -274,6 +319,9 @@ func.func @transfer_read_flattenable_negative( // CHECK-LABEL: func @transfer_read_flattenable_negative // CHECK: vector.transfer_read {{.*}} vector<2x2x2x2xi8> +// CHECK-128B-LABEL: func @transfer_read_flattenable_negative( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_read_flattenable_negative2( @@ -288,6 +336,9 @@ func.func @transfer_read_flattenable_negative2( // CHECK-LABEL: func @transfer_read_flattenable_negative2 // CHECK: vector.transfer_read {{.*}} vector<5x4x3x2xi8> 
+// CHECK-128B-LABEL: func @transfer_read_flattenable_negative2( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @fold_unit_dim_add_basic(%arg0 : vector<1x8xi32>) -> vector<1x8xi32> { @@ -302,6 +353,9 @@ func.func @fold_unit_dim_add_basic(%arg0 : vector<1x8xi32>) -> vector<1x8xi32> { // CHECK: %[[VAL_4:.*]] = vector.shape_cast %[[VAL_3]] : vector<8xi32> to vector<1x8xi32> // CHECK: return %[[VAL_4]] : vector<1x8xi32> +// CHECK-128B-LABEL: func @fold_unit_dim_add_basic( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @fold_unit_dim_add_leading_and_trailing(%arg0 : vector<1x8x1xi32>) -> vector<1x8x1xi32> { @@ -316,6 +370,9 @@ func.func @fold_unit_dim_add_leading_and_trailing(%arg0 : vector<1x8x1xi32>) -> // CHECK: %[[VAL_4:.*]] = vector.shape_cast %[[VAL_3]] : vector<8xi32> to vector<1x8x1xi32> // CHECK: return %[[VAL_4]] : vector<1x8x1xi32> +// CHECK-128B-LABEL: func @fold_unit_dim_add_leading_and_trailing( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @fold_unit_dim_add(%arg0 : vector<8x1xi32>, @@ -334,6 +391,9 @@ func.func @fold_unit_dim_add(%arg0 : vector<8x1xi32>, // CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_2]], %[[VAL_3]] : vector<8xi32> // CHECK: return %[[VAL_4]] : vector<8xi32> +// CHECK-128B-LABEL: func @fold_unit_dim_add( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @fold_unit_dim_mulf(%arg0 : vector<8x[2]x1xf32>, @@ -352,6 +412,9 @@ func.func @fold_unit_dim_mulf(%arg0 : vector<8x[2]x1xf32>, // CHECK: %[[VAL_4:.*]] = arith.mulf %[[VAL_2]], %[[VAL_3]] : vector<8x[2]xf32> // CHECK: return %[[VAL_4]] : vector<8x[2]xf32> +// CHECK-128B-LABEL: func @fold_unit_dim_mulf( +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @fold_unit_dim_sitofp(%arg0 : vector<8x[2]x1xi8>) -> vector<8x[2]xf32> { @@ -367,6 +430,9 @@ func.func @fold_unit_dim_sitofp(%arg0 : vector<8x[2]x1xi8>) -> vector<8x[2]xf32> // CHECK: %[[VAL_2:.*]] = arith.sitofp %[[VAL_1]] : vector<8x[2]xi8> to vector<8x[2]xf32> // 
CHECK: return %[[VAL_2]] : vector<8x[2]xf32> +// CHECK-128B-LABEL: func @fold_unit_dim_sitofp( +// CHECK-128B-NOT: memref.collapse_shape + // ----- // All shape casts are folded away @@ -389,3 +455,7 @@ func.func @fold_unit_dims_entirely(%arg0 : vector<8xi32>, // CHECK: %[[VAL_3:.*]] = arith.muli %[[VAL_0]], %[[VAL_1]] : vector<8xi32> // CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_3]], %[[VAL_2]] : vector<8xi32> // CHECK: return %[[VAL_4]] : vector<8xi32> + +// CHECK-128B-LABEL: func @fold_unit_dims_entirely( +// CHECK-128B-NOT: memref.collapse_shape + diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp index acd38980514a5..178a58e796b24 100644 --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -466,21 +466,35 @@ struct TestFlattenVectorTransferPatterns MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( TestFlattenVectorTransferPatterns) + TestFlattenVectorTransferPatterns() = default; + TestFlattenVectorTransferPatterns( + const TestFlattenVectorTransferPatterns &pass) + : PassWrapper(pass) {} + StringRef getArgument() const final { return "test-vector-transfer-flatten-patterns"; } + StringRef getDescription() const final { return "Test patterns to rewrite contiguous row-major N-dimensional " "vector.transfer_{read,write} ops into 1D transfers"; } + void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); registry.insert(); registry.insert(); } + + Option targetVectorBitwidth{ + *this, "target-vector-bitwidth", + llvm::cl::desc( + "Minimum vector bitwidth to enable the flattening transformation"), + llvm::cl::init(std::numeric_limits::max())}; + void runOnOperation() override { RewritePatternSet patterns(&getContext()); - populateFlattenVectorTransferPatterns(patterns); + populateFlattenVectorTransferPatterns(patterns, targetVectorBitwidth); (void)applyPatternsAndFoldGreedily(getOperation(), 
std::move(patterns)); } }; From 58f45d909d2a1565128846e423b480808736f214 Mon Sep 17 00:00:00 2001 From: Pranav Bhandarkar Date: Wed, 21 Feb 2024 11:28:25 -0600 Subject: [PATCH 124/351] [flang][openmp] - depend clause support in target, target enter/update/exit data constructs (#81610) This patch adds support in flang for the depend clause in target and target enter/update/exit constructs. Previously, the following line in a fortran program would have resulted in the error shown below it. !$omp target map(to:a) depend(in:a) "not yet implemented: Unhandled clause DEPEND in TARGET construct" --- flang/lib/Lower/OpenMP/OpenMP.cpp | 30 ++++--- flang/lib/Semantics/check-omp-structure.cpp | 8 ++ flang/test/Lower/OpenMP/target.f90 | 85 +++++++++++++++++++ .../Semantics/OpenMP/clause-validity01.f90 | 1 + 4 files changed, 113 insertions(+), 11 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 3aefad6cf0ec1..89bd5ed080b20 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -761,7 +761,8 @@ genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::StatementContext stmtCtx; mlir::Value ifClauseOperand, deviceOperand; mlir::UnitAttr nowaitAttr; - llvm::SmallVector mapOperands; + llvm::SmallVector mapOperands, dependOperands; + llvm::SmallVector dependTypeOperands; Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName; llvm::omp::Directive directive; @@ -784,6 +785,7 @@ genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter, ClauseProcessor cp(converter, semaCtx, clauseList); cp.processIf(directiveName, ifClauseOperand); cp.processDevice(stmtCtx, deviceOperand); + cp.processDepend(dependTypeOperands, dependOperands); cp.processNowait(nowaitAttr); if constexpr (std::is_same_v) { @@ -796,12 +798,13 @@ genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter, cp.processMap(currentLocation, directive, stmtCtx, mapOperands); } - 
cp.processTODO(currentLocation, - directive); - - return firOpBuilder.create(currentLocation, ifClauseOperand, - deviceOperand, nullptr, mlir::ValueRange(), - nowaitAttr, mapOperands); + return firOpBuilder.create( + currentLocation, ifClauseOperand, deviceOperand, + dependTypeOperands.empty() + ? nullptr + : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(), + dependTypeOperands), + dependOperands, nowaitAttr, mapOperands); } // This functions creates a block for the body of the targetOp's region. It adds @@ -968,7 +971,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::StatementContext stmtCtx; mlir::Value ifClauseOperand, deviceOperand, threadLimitOperand; mlir::UnitAttr nowaitAttr; - llvm::SmallVector mapOperands; + llvm::SmallVector dependTypeOperands; + llvm::SmallVector mapOperands, dependOperands; llvm::SmallVector mapSymTypes; llvm::SmallVector mapSymLocs; llvm::SmallVector mapSymbols; @@ -978,11 +982,12 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, ifClauseOperand); cp.processDevice(stmtCtx, deviceOperand); cp.processThreadLimit(stmtCtx, threadLimitOperand); + cp.processDepend(dependTypeOperands, dependOperands); cp.processNowait(nowaitAttr); cp.processMap(currentLocation, directive, stmtCtx, mapOperands, &mapSymTypes, &mapSymLocs, &mapSymbols); + cp.processTODO( currentLocation, llvm::omp::Directive::OMPD_target); - // 5.8.1 Implicit Data-Mapping Attribute Rules // The following code follows the implicit data-mapping rules to map all the // symbols used inside the region that have not been explicitly mapped using @@ -1066,7 +1070,11 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, auto targetOp = converter.getFirOpBuilder().create( currentLocation, ifClauseOperand, deviceOperand, threadLimitOperand, - nullptr, mlir::ValueRange(), nowaitAttr, mapOperands); + dependTypeOperands.empty() + ? 
nullptr + : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(), + dependTypeOperands), + dependOperands, nowaitAttr, mapOperands); genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSymTypes, mapSymLocs, mapSymbols, currentLocation); diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 03423de0c6104..54101ab8a42bb 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2815,6 +2815,14 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Device &x) { void OmpStructureChecker::Enter(const parser::OmpClause::Depend &x) { CheckAllowed(llvm::omp::Clause::OMPC_depend); + if ((std::holds_alternative(x.v.u) || + std::holds_alternative(x.v.u)) && + GetContext().directive != llvm::omp::OMPD_ordered) { + context_.Say(GetContext().clauseSource, + "DEPEND(SOURCE) or DEPEND(SINK : vec) can be used only with the ordered" + " directive. Used here in the %s construct."_err_en_US, + parser::ToUpperCaseLetters(getDirectiveName(GetContext().directive))); + } if (const auto *inOut{std::get_if(&x.v.u)}) { const auto &designators{std::get>(inOut->t)}; for (const auto &ele : designators) { diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index fa07b7f71d514..030533e1a0455 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -14,6 +14,26 @@ subroutine omp_target_enter_simple return end subroutine omp_target_enter_simple +!=============================================================================== +! 
Target_Enter `depend` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_enter_depend() { +subroutine omp_target_enter_depend + !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_enter_dependEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + integer :: a(1024) + + !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) { + !$omp task depend(out: a) + call foo(a) + !$omp end task + !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) + !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: omp.target_enter_data map_entries(%[[MAP]] : !fir.ref>) depend(taskdependin -> %[[A]]#1 : !fir.ref>) + !$omp target enter data map(to: a) depend(in: a) + return +end subroutine omp_target_enter_depend + !=============================================================================== ! Target_Enter Map types !=============================================================================== @@ -134,6 +154,45 @@ subroutine omp_target_exit_device !$omp target exit data map(from: a) device(d) end subroutine omp_target_exit_device +!=============================================================================== +! 
Target_Exit `depend` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_exit_depend() { +subroutine omp_target_exit_depend + !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_exit_dependEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + integer :: a(1024) + !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) { + !$omp task depend(out: a) + call foo(a) + !$omp end task + !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) + !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: omp.target_exit_data map_entries(%[[MAP]] : !fir.ref>) depend(taskdependout -> %[[A]]#1 : !fir.ref>) + !$omp target exit data map(from: a) depend(out: a) +end subroutine omp_target_exit_depend + + +!=============================================================================== +! 
Target_Update `depend` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_update_depend() { +subroutine omp_target_update_depend + !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_update_dependEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + integer :: a(1024) + + !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) { + !$omp task depend(out: a) + call foo(a) + !$omp end task + + !CHECK: %[[BOUNDS:.*]] = omp.bounds + !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[A]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: omp.target_update_data motion_entries(%[[MAP]] : !fir.ref>) depend(taskdependin -> %[[A]]#1 : !fir.ref>) + !$omp target update to(a) depend(in:a) +end subroutine omp_target_update_depend + !=============================================================================== ! Target_Update `to` clause !=============================================================================== @@ -295,6 +354,32 @@ subroutine omp_target !CHECK: } end subroutine omp_target +!=============================================================================== +! 
Target with region `depend` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_depend() { +subroutine omp_target_depend + !CHECK: %[[EXTENT_A:.*]] = arith.constant 1024 : index + !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_dependEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + integer :: a(1024) + !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) { + !$omp task depend(out: a) + call foo(a) + !$omp end task + !CHECK: %[[STRIDE_A:.*]] = arith.constant 1 : index + !CHECK: %[[LBOUND_A:.*]] = arith.constant 0 : index + !CHECK: %[[UBOUND_A:.*]] = arith.subi %c1024, %c1 : index + !CHECK: %[[BOUNDS_A:.*]] = omp.bounds lower_bound(%[[LBOUND_A]] : index) upper_bound(%[[UBOUND_A]] : index) extent(%[[EXTENT_A]] : index) stride(%[[STRIDE_A]] : index) start_idx(%[[STRIDE_A]] : index) + !CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[A]]#0 : !fir.ref>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_A]]) -> !fir.ref> {name = "a"} + !CHECK: omp.target map_entries(%[[MAP_A]] -> %[[BB0_ARG:.*]] : !fir.ref>) depend(taskdependin -> %[[A]]#1 : !fir.ref>) { + !$omp target map(tofrom: a) depend(in: a) + a(1) = 10 + !CHECK: omp.terminator + !$omp end target + !CHECK: } + end subroutine omp_target_depend + !=============================================================================== ! Target implicit capture !=============================================================================== diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 3fa86ed105a29..d9573a81821f3 100644 --- a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -481,6 +481,7 @@ !$omp taskyield !$omp barrier !$omp taskwait + !ERROR: DEPEND(SOURCE) or DEPEND(SINK : vec) can be used only with the ordered directive. 
Used here in the TASKWAIT construct. !$omp taskwait depend(source) ! !$omp taskwait depend(sink:i-1) ! !$omp target enter data map(to:arrayA) map(alloc:arrayB) From cc374d8056990a4c6df44173ad7ef59474ba498b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 21 Feb 2024 11:33:32 -0600 Subject: [PATCH 125/351] [OpenMP] Remove `register_requires` global constructor (#80460) Summary: Currently, OpenMP handles the `omp requires` clause by emitting a global constructor into the runtime for every translation unit that requires it. However, this is not a great solution because it prevents us from having a defined order in which the runtime is accessed and used. This patch changes the approach to no longer use global constructors, but to instead group the flag with the other offloading entires that we already handle. This has the effect of still registering each flag per requires TU, but now we have a single constructor that handles everything. This function removes support for the old `__tgt_register_requires` and replaces it with a warning message. We just had a recent release, and the OpenMP policy for the past four releases since we switched to LLVM is that we do not provide strict backwards compatibility between major LLVM releases now that the library is versioned. This means that a user will need to recompile if they have an old binary that relied on `register_requires` having the old behavior. It is important that we actively deprecate this, as otherwise it would not solve the problem of having no defined init and shutdown order for `libomptarget`. The problem of `libomptarget` not having a define init and shutdown order cascades into a lot of other issues so I have a strong incentive to be rid of it. It is worth noting that the current `__tgt_offload_entry` only has space for a 32-bit integer here. I am planning to overhaul these at some point as well. 
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 38 -- clang/lib/CodeGen/CGOpenMPRuntime.h | 4 - clang/lib/CodeGen/CodeGenModule.cpp | 4 - clang/test/OpenMP/bug60602.cpp | 7 - clang/test/OpenMP/distribute_codegen.cpp | 14 - .../distribute_firstprivate_codegen.cpp | 32 +- .../OpenMP/distribute_lastprivate_codegen.cpp | 32 +- .../distribute_parallel_for_codegen.cpp | 28 - ...bute_parallel_for_firstprivate_codegen.cpp | 32 +- .../distribute_parallel_for_if_codegen.cpp | 7 - ...ibute_parallel_for_lastprivate_codegen.cpp | 32 +- ...ibute_parallel_for_num_threads_codegen.cpp | 532 +++++++-------- ...istribute_parallel_for_private_codegen.cpp | 32 +- ...tribute_parallel_for_proc_bind_codegen.cpp | 7 - .../distribute_parallel_for_simd_codegen.cpp | 28 - ...parallel_for_simd_firstprivate_codegen.cpp | 60 +- ...istribute_parallel_for_simd_if_codegen.cpp | 28 - ..._parallel_for_simd_lastprivate_codegen.cpp | 76 +-- ..._parallel_for_simd_num_threads_codegen.cpp | 644 +++++++++--------- ...bute_parallel_for_simd_private_codegen.cpp | 28 - ...te_parallel_for_simd_proc_bind_codegen.cpp | 7 - .../OpenMP/distribute_private_codegen.cpp | 32 +- clang/test/OpenMP/distribute_simd_codegen.cpp | 64 +- .../distribute_simd_firstprivate_codegen.cpp | 60 +- .../distribute_simd_lastprivate_codegen.cpp | 76 +-- .../distribute_simd_private_codegen.cpp | 28 - .../distribute_simd_reduction_codegen.cpp | 21 - clang/test/OpenMP/map_struct_ordering.cpp | 7 - clang/test/OpenMP/nvptx_lambda_capturing.cpp | 43 +- .../OpenMP/openmp_offload_registration.cpp | 9 - clang/test/OpenMP/reduction_implicit_map.cpp | 42 +- clang/test/OpenMP/target_codegen.cpp | 8 - .../OpenMP/target_codegen_global_capture.cpp | 14 - .../OpenMP/target_codegen_registration.cpp | 54 +- clang/test/OpenMP/target_depend_codegen.cpp | 8 - .../OpenMP/target_firstprivate_codegen.cpp | 28 - .../OpenMP/target_has_device_addr_codegen.cpp | 25 +- .../target_has_device_addr_codegen_01.cpp | 19 +- .../OpenMP/target_is_device_ptr_codegen.cpp | 164 
++--- clang/test/OpenMP/target_map_codegen_03.cpp | 14 - clang/test/OpenMP/target_map_codegen_hold.cpp | 28 - .../OpenMP/target_map_deref_array_codegen.cpp | 7 - .../OpenMP/target_map_member_expr_codegen.cpp | 27 +- .../target_offload_mandatory_codegen.cpp | 7 - .../target_ompx_dyn_cgroup_mem_codegen.cpp | 122 ++-- clang/test/OpenMP/target_parallel_codegen.cpp | 94 ++- .../target_parallel_codegen_registration.cpp | 4 +- .../OpenMP/target_parallel_depend_codegen.cpp | 8 - .../OpenMP/target_parallel_for_codegen.cpp | 284 ++++---- ...rget_parallel_for_codegen_registration.cpp | 4 +- .../target_parallel_for_depend_codegen.cpp | 4 - .../target_parallel_for_simd_codegen.cpp | 224 +++--- ...parallel_for_simd_codegen_registration.cpp | 4 +- ...arget_parallel_for_simd_depend_codegen.cpp | 4 - ...target_parallel_generic_loop_codegen-1.cpp | 125 ++-- ...target_parallel_generic_loop_codegen-2.cpp | 14 - ...t_parallel_generic_loop_depend_codegen.cpp | 7 - ...l_generic_loop_uses_allocators_codegen.cpp | 7 - .../OpenMP/target_parallel_if_codegen.cpp | 14 - .../target_parallel_num_threads_codegen.cpp | 14 - clang/test/OpenMP/target_simd_codegen.cpp | 4 - .../target_simd_codegen_registration.cpp | 4 +- .../OpenMP/target_simd_depend_codegen.cpp | 4 - .../OpenMP/target_task_affinity_codegen.cpp | 142 ++-- clang/test/OpenMP/target_teams_codegen.cpp | 14 - .../target_teams_codegen_registration.cpp | 4 +- .../OpenMP/target_teams_depend_codegen.cpp | 4 - .../target_teams_distribute_codegen.cpp | 142 ++-- ..._teams_distribute_codegen_registration.cpp | 4 +- ...rget_teams_distribute_collapse_codegen.cpp | 28 - ...target_teams_distribute_depend_codegen.cpp | 4 - ...teams_distribute_dist_schedule_codegen.cpp | 28 - ..._teams_distribute_firstprivate_codegen.cpp | 29 +- ...t_teams_distribute_lastprivate_codegen.cpp | 32 +- ..._teams_distribute_parallel_for_codegen.cpp | 14 - ...stribute_parallel_for_collapse_codegen.cpp | 28 - ...distribute_parallel_for_depend_codegen.cpp | 4 - 
...ute_parallel_for_dist_schedule_codegen.cpp | 28 - ...bute_parallel_for_firstprivate_codegen.cpp | 31 +- ...ams_distribute_parallel_for_if_codegen.cpp | 7 - ...ibute_parallel_for_lastprivate_codegen.cpp | 32 +- ..._distribute_parallel_for_order_codegen.cpp | 7 - ...istribute_parallel_for_private_codegen.cpp | 31 +- ...tribute_parallel_for_proc_bind_codegen.cpp | 7 - ...tribute_parallel_for_reduction_codegen.cpp | 61 +- ...stribute_parallel_for_schedule_codegen.cpp | 56 -- ...s_distribute_parallel_for_simd_codegen.cpp | 14 - ...parallel_for_simd_codegen_registration.cpp | 4 +- ...ute_parallel_for_simd_collapse_codegen.cpp | 28 - ...ibute_parallel_for_simd_depend_codegen.cpp | 4 - ...arallel_for_simd_dist_schedule_codegen.cpp | 28 - ...parallel_for_simd_firstprivate_codegen.cpp | 35 +- ...istribute_parallel_for_simd_if_codegen.cpp | 40 +- ..._parallel_for_simd_lastprivate_codegen.cpp | 76 +-- ...bute_parallel_for_simd_private_codegen.cpp | 21 - ...te_parallel_for_simd_proc_bind_codegen.cpp | 7 - ...te_parallel_for_simd_reduction_codegen.cpp | 61 +- ...ute_parallel_for_simd_schedule_codegen.cpp | 56 -- ...arget_teams_distribute_private_codegen.cpp | 29 +- ...get_teams_distribute_reduction_codegen.cpp | 221 +++--- .../target_teams_distribute_simd_codegen.cpp | 308 ++++----- ...s_distribute_simd_codegen_registration.cpp | 4 +- ...teams_distribute_simd_collapse_codegen.cpp | 28 - ...t_teams_distribute_simd_depend_codegen.cpp | 4 - ..._distribute_simd_dist_schedule_codegen.cpp | 28 - ...s_distribute_simd_firstprivate_codegen.cpp | 33 +- ...ms_distribute_simd_lastprivate_codegen.cpp | 76 +-- ..._teams_distribute_simd_private_codegen.cpp | 21 - ...eams_distribute_simd_reduction_codegen.cpp | 41 +- .../target_teams_generic_loop_codegen-1.cpp | 14 - ...et_teams_generic_loop_collapse_codegen.cpp | 28 - ...rget_teams_generic_loop_depend_codegen.cpp | 7 - .../target_teams_generic_loop_if_codegen.cpp | 7 - ...arget_teams_generic_loop_order_codegen.cpp | 7 - 
...get_teams_generic_loop_private_codegen.cpp | 31 +- ...t_teams_generic_loop_reduction_codegen.cpp | 61 +- ...s_generic_loop_uses_allocators_codegen.cpp | 7 - .../test/OpenMP/target_teams_map_codegen.cpp | 78 +-- .../OpenMP/target_teams_num_teams_codegen.cpp | 14 - .../target_teams_thread_limit_codegen.cpp | 14 - clang/test/OpenMP/teams_codegen.cpp | 42 -- .../test/OpenMP/teams_distribute_codegen.cpp | 100 +-- .../teams_distribute_collapse_codegen.cpp | 56 +- ...teams_distribute_dist_schedule_codegen.cpp | 104 ++- .../teams_distribute_firstprivate_codegen.cpp | 37 +- .../teams_distribute_lastprivate_codegen.cpp | 92 +-- .../teams_distribute_parallel_for_codegen.cpp | 116 +--- ...stribute_parallel_for_collapse_codegen.cpp | 68 +- ...distribute_parallel_for_copyin_codegen.cpp | 59 +- ...ute_parallel_for_dist_schedule_codegen.cpp | 140 ++-- ...bute_parallel_for_firstprivate_codegen.cpp | 47 +- ...ams_distribute_parallel_for_if_codegen.cpp | 67 +- ...ibute_parallel_for_lastprivate_codegen.cpp | 120 ++-- ...ibute_parallel_for_num_threads_codegen.cpp | 262 ++++--- ...istribute_parallel_for_private_codegen.cpp | 47 +- ...tribute_parallel_for_proc_bind_codegen.cpp | 27 +- ...tribute_parallel_for_reduction_codegen.cpp | 113 ++- ...stribute_parallel_for_schedule_codegen.cpp | 424 +++++------- ...s_distribute_parallel_for_simd_codegen.cpp | 116 +--- ...ute_parallel_for_simd_collapse_codegen.cpp | 68 +- ...arallel_for_simd_dist_schedule_codegen.cpp | 140 ++-- ...parallel_for_simd_firstprivate_codegen.cpp | 51 +- ...istribute_parallel_for_simd_if_codegen.cpp | 276 ++++---- ..._parallel_for_simd_lastprivate_codegen.cpp | 164 ++--- ..._parallel_for_simd_num_threads_codegen.cpp | 314 ++++----- ...bute_parallel_for_simd_private_codegen.cpp | 21 - ...te_parallel_for_simd_proc_bind_codegen.cpp | 27 +- ...te_parallel_for_simd_reduction_codegen.cpp | 113 ++- ...ute_parallel_for_simd_schedule_codegen.cpp | 424 +++++------- .../teams_distribute_private_codegen.cpp | 37 +- 
.../teams_distribute_reduction_codegen.cpp | 73 +- .../OpenMP/teams_distribute_simd_codegen.cpp | 92 +-- ...teams_distribute_simd_collapse_codegen.cpp | 56 +- ..._distribute_simd_dist_schedule_codegen.cpp | 104 ++- ...s_distribute_simd_firstprivate_codegen.cpp | 41 +- ...ms_distribute_simd_lastprivate_codegen.cpp | 136 ++-- .../teams_distribute_simd_private_codegen.cpp | 21 - ...eams_distribute_simd_reduction_codegen.cpp | 73 +- .../OpenMP/teams_firstprivate_codegen.cpp | 166 ++--- .../OpenMP/teams_generic_loop_codegen-1.cpp | 116 +--- .../teams_generic_loop_collapse_codegen.cpp | 68 +- .../teams_generic_loop_private_codegen.cpp | 47 +- .../teams_generic_loop_reduction_codegen.cpp | 113 ++- clang/test/OpenMP/teams_private_codegen.cpp | 136 ++-- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 12 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 40 +- .../Frontend/OpenMPIRBuilderTest.cpp | 40 -- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 23 +- mlir/test/Target/LLVMIR/openmp-llvm.mlir | 6 - openmp/libomptarget/include/Shared/APITypes.h | 2 +- openmp/libomptarget/include/omptarget.h | 4 +- openmp/libomptarget/src/PluginManager.cpp | 6 + openmp/libomptarget/src/interface.cpp | 4 +- openmp/libomptarget/src/omptarget.cpp | 2 +- .../libomptarget/test/offloading/requires.c | 39 +- 175 files changed, 3622 insertions(+), 7044 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 4855e7410a015..a7b72df6d9f89 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -10100,44 +10100,6 @@ bool CGOpenMPRuntime::markAsGlobalTarget(GlobalDecl GD) { return !AlreadyEmittedTargetDecls.insert(D).second; } -llvm::Function *CGOpenMPRuntime::emitRequiresDirectiveRegFun() { - // If we don't have entries or if we are emitting code for the device, we - // don't need to do anything. 
- if (CGM.getLangOpts().OMPTargetTriples.empty() || - CGM.getLangOpts().OpenMPSimd || CGM.getLangOpts().OpenMPIsTargetDevice || - (OMPBuilder.OffloadInfoManager.empty() && - !HasEmittedDeclareTargetRegion && !HasEmittedTargetRegion)) - return nullptr; - - // Create and register the function that handles the requires directives. - ASTContext &C = CGM.getContext(); - - llvm::Function *RequiresRegFn; - { - CodeGenFunction CGF(CGM); - const auto &FI = CGM.getTypes().arrangeNullaryFunction(); - llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FI); - std::string ReqName = getName({"omp_offloading", "requires_reg"}); - RequiresRegFn = CGM.CreateGlobalInitOrCleanUpFunction(FTy, ReqName, FI); - CGF.StartFunction(GlobalDecl(), C.VoidTy, RequiresRegFn, FI, {}); - // TODO: check for other requires clauses. - // The requires directive takes effect only when a target region is - // present in the compilation unit. Otherwise it is ignored and not - // passed to the runtime. This avoids the runtime from throwing an error - // for mismatching requires clauses across compilation units that don't - // contain at least 1 target region. - assert((HasEmittedTargetRegion || HasEmittedDeclareTargetRegion || - !OMPBuilder.OffloadInfoManager.empty()) && - "Target or declare target region expected."); - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___tgt_register_requires), - llvm::ConstantInt::get( - CGM.Int64Ty, OMPBuilder.Config.getRequiresFlags())); - CGF.FinishFunction(); - } - return RequiresRegFn; -} - void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index b01b39abd1606..c3206427b143e 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1407,10 +1407,6 @@ class CGOpenMPRuntime { /// \param GD Global to scan. 
virtual bool emitTargetGlobal(GlobalDecl GD); - /// Creates and returns a registration function for when at least one - /// requires directives was used in the current module. - llvm::Function *emitRequiresDirectiveRegFun(); - /// Creates all the offload entries in the current compilation unit /// along with the associated metadata. void createOffloadEntriesAndInfoMetadata(); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 836cd34a16c0a..77fb3a62b356e 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -838,10 +838,6 @@ void CodeGenModule::Release() { AddGlobalCtor(CudaCtorFunction); } if (OpenMPRuntime) { - if (llvm::Function *OpenMPRequiresDirectiveRegFun = - OpenMPRuntime->emitRequiresDirectiveRegFun()) { - AddGlobalCtor(OpenMPRequiresDirectiveRegFun, 0); - } OpenMPRuntime->createOffloadEntriesAndInfoMetadata(); OpenMPRuntime->clear(); } diff --git a/clang/test/OpenMP/bug60602.cpp b/clang/test/OpenMP/bug60602.cpp index 2fbfdfde07a0c..3ecc70cab778a 100644 --- a/clang/test/OpenMP/bug60602.cpp +++ b/clang/test/OpenMP/bug60602.cpp @@ -569,10 +569,3 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) { // CHECK: omp.precond.end: // CHECK-NEXT: ret void // -// -// CHECK-LABEL: define internal void @.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp index e3b43002a0518..31ec6ff911905 100644 --- a/clang/test/OpenMP/distribute_codegen.cpp +++ b/clang/test/OpenMP/distribute_codegen.cpp @@ -1037,13 +1037,6 @@ int fint(void) { return ftemplate(); } // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void 
@__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z23without_schedule_clausePfS_S_S_ // CHECK3-SAME: (ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1953,13 +1946,6 @@ int fint(void) { return ftemplate(); } // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l56 // CHECK17-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: diff --git a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp index 361e26bc2984c..800a002e43968 100644 --- a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp @@ -304,13 +304,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -476,13 +469,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -794,7 +780,7 @@ int main() 
{ // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1148,13 +1134,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1464,7 +1443,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1815,10 +1794,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp index e005de30e14d1..772372076e947 100644 --- a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp @@ -291,13 +291,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () 
#[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -460,13 +453,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -797,7 +783,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1169,13 +1155,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1504,7 +1483,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1873,10 +1852,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_codegen.cpp index 7bdc4c5ab21a7..95adefa8020f6 100644 --- 
a/clang/test/OpenMP/distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_codegen.cpp @@ -2538,13 +2538,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -4265,13 +4258,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -8886,13 +8872,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -13404,10 +13383,3 @@ int main() { // CHECK11: omp.precond.end: // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp index 9f900facc6a54..46c115e40e435 100644 --- a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp @@ -504,13 +504,6 @@ int main() { // CHECK1-NEXT: ret 
void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -759,13 +752,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK8-LABEL: define {{[^@]+}}@main // CHECK8-SAME: () #[[ATTR0:[0-9]+]] { // CHECK8-NEXT: entry: @@ -1207,7 +1193,7 @@ int main() { // // // CHECK8-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK8-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK8-SAME: () #[[ATTR1]] comdat { // CHECK8-NEXT: entry: // CHECK8-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK8-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1685,13 +1671,6 @@ int main() { // CHECK8-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK8-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK8-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@main // CHECK10-SAME: () #[[ATTR0:[0-9]+]] { // CHECK10-NEXT: entry: @@ -2127,7 +2106,7 @@ int main() { // // // CHECK10-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK10-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK10-SAME: () #[[ATTR1]] comdat { // CHECK10-NEXT: entry: // CHECK10-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK10-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2598,10 +2577,3 @@ int main() { // CHECK10-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK10-NEXT: ret void // -// -// CHECK10-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK10-SAME: () #[[ATTR6:[0-9]+]] 
{ -// CHECK10-NEXT: entry: -// CHECK10-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK10-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp index 83c9f504ccaca..846e7beb5d92f 100644 --- a/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp @@ -1609,10 +1609,3 @@ int main() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp index 8c44a1e71ae79..aa981f606cc87 100644 --- a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp @@ -462,13 +462,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -734,13 +727,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1219,7 +1205,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () 
#[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1733,13 +1719,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -2212,7 +2191,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2719,10 +2698,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp index 9f769ca2886fe..5d9244268d554 100644 --- a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp @@ -112,7 +112,7 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK1-NEXT: [[CALL:%.*]] = invoke noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label 
[[LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -145,16 +145,16 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR3:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: lpad: // CHECK1-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: cleanup +// CHECK1-NEXT: cleanup // CHECK1-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK1-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK1-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK1-NEXT: br label [[EH_RESUME:%.*]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -198,21 +198,21 @@ int main() { // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // 
CHECK1-NEXT: [[TMP40:%.*]] = load i8, ptr [[A]], align 1 // CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP40]] to i32 // CHECK1-NEXT: [[CALL6:%.*]] = invoke noundef i32 @_Z5tmainIcLi5EEiv() -// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK1: invoke.cont5: // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK1-NEXT: [[CALL8:%.*]] = invoke noundef i32 @_Z5tmainI1SLi1EEiv() -// CHECK1-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK1: invoke.cont7: // CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK1-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK1-NEXT: ret i32 [[TMP41]] // CHECK1: eh.resume: @@ -237,7 +237,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -250,14 +250,14 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR2:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -318,7 +318,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -373,7 +373,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -390,21 +390,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: 
catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10:[0-9]+]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7:[0-9]+]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { -// CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] -// CHECK1-NEXT: call void @_ZSt9terminatev() #[[ATTR10]] +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR3]] +// CHECK1-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 @@ -413,7 +413,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -479,7 +479,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -534,7 +534,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -551,14 +551,14 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -594,7 +594,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // 
CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -627,14 +627,14 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret i32 0 // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK1-SAME: () #[[ATTR7]] comdat { +// CHECK1-SAME: () #[[ATTR6]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -670,7 +670,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -703,24 +703,24 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // 
CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret i32 0 // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat align 2 { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK1-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]] // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -734,7 +734,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // 
CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -743,14 +743,14 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -811,7 +811,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -866,7 +866,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: 
invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -883,21 +883,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -958,7 +958,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] 
personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1013,7 +1013,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1030,21 +1030,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1105,7 +1105,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1160,7 +1160,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1177,21 +1177,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: 
[[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1233,14 +1233,14 @@ int main() { // CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: // CHECK1-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[CALL:%.*]] = invoke noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK1-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label 
[[TERMINATE_LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]] // CHECK1: invoke.cont2: // CHECK1-NEXT: [[TMP7:%.*]] = sext i8 [[CALL]] to i32 // CHECK1-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP1]], i32 [[TMP7]]) -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR3]] // CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 @@ -1260,14 +1260,14 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP14:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP15:%.*]] = extractvalue { ptr, i32 } [[TMP14]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP15]]) #[[ATTR10]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP15]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1322,7 +1322,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // 
CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1339,19 +1339,12 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: @@ -1371,7 +1364,7 @@ int main() { // CHECK5-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK5-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK5-NEXT: [[CALL:%.*]] = invoke noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK5-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -1404,16 +1397,16 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label 
[[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR4:[0-9]+]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR3:[0-9]+]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: lpad: // CHECK5-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: cleanup +// CHECK5-NEXT: cleanup // CHECK5-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK5-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK5-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK5-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK5-NEXT: br label [[EH_RESUME:%.*]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -1457,21 +1450,21 @@ int main() { // CHECK5-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK5-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: [[TMP40:%.*]] = load i8, ptr [[A]], align 1 // CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP40]] to i32 // CHECK5-NEXT: [[CALL6:%.*]] = invoke noundef i32 @_Z5tmainIcLi5EEiv() -// CHECK5-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK5: invoke.cont5: // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // 
CHECK5-NEXT: [[CALL8:%.*]] = invoke noundef i32 @_Z5tmainI1SLi1EEiv() -// CHECK5-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK5: invoke.cont7: // CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK5-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK5-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK5-NEXT: ret i32 [[TMP41]] // CHECK5: eh.resume: @@ -1496,7 +1489,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1509,14 +1502,14 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK5-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR2:[0-9]+]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1577,7 +1570,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1632,7 +1625,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -1649,21 +1642,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// 
CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10:[0-9]+]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7:[0-9]+]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK5-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { -// CHECK5-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] -// CHECK5-NEXT: call void @_ZSt9terminatev() #[[ATTR10]] +// CHECK5-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK5-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR3]] +// CHECK5-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK5-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (i64 noundef [[A:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK5-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 @@ -1672,7 +1665,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1738,7 +1731,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1793,7 +1786,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -1810,14 +1803,14 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK5-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK5-SAME: () #[[ATTR6:[0-9]+]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -1853,7 +1846,7 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label 
[[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -1886,14 +1879,14 @@ int main() { // CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK5-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: ret i32 0 // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK5-SAME: () #[[ATTR7]] comdat { +// CHECK5-SAME: () #[[ATTR6]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -1929,7 +1922,7 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -1962,24 +1955,24 @@ int main() { // CHECK5-NEXT: 
[[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK5-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: ret i32 0 // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat align 2 { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK5-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]] // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -1993,14 +1986,14 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52 -// CHECK5-SAME: () #[[ATTR3]] { +// CHECK5-SAME: () #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2061,7 +2054,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2116,7 +2109,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2133,21 +2126,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: 
[[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57 -// CHECK5-SAME: () #[[ATTR3]] { +// CHECK5-SAME: () #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2208,7 +2201,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2263,7 +2256,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2280,21 +2273,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52 -// CHECK5-SAME: () #[[ATTR3]] { +// CHECK5-SAME: () #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2355,7 +2348,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2410,7 +2403,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2427,21 +2420,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: 
[[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57 -// CHECK5-SAME: () #[[ATTR3]] { +// CHECK5-SAME: () #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2483,14 +2476,14 @@ int main() { // CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK5: omp.inner.for.body: // CHECK5-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: [[CALL:%.*]] = invoke noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK5-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label 
[[TERMINATE_LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]] // CHECK5: invoke.cont2: // CHECK5-NEXT: [[TMP7:%.*]] = sext i8 [[CALL]] to i32 // CHECK5-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP1]], i32 [[TMP7]]) -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR3]] // CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK5-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 @@ -2510,14 +2503,14 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP14:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP15:%.*]] = extractvalue { ptr, i32 } [[TMP14]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP15]]) #[[ATTR10]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP15]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2572,7 +2565,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // 
CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2589,14 +2582,14 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -2604,13 +2597,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: @@ -2630,7 +2616,7 @@ int main() { // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK9-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK9-NEXT: [[CALL:%.*]] = invoke noundef i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// 
CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -2663,16 +2649,16 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: lpad: // CHECK9-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: cleanup +// CHECK9-NEXT: cleanup // CHECK9-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK9-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK9-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK9-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK9-NEXT: br label [[EH_RESUME:%.*]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -2716,21 +2702,21 @@ int main() { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR3]] // CHECK9-NEXT: br 
label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: [[TMP40:%.*]] = load i8, ptr [[A]], align 1 // CHECK9-NEXT: [[CONV:%.*]] = sext i8 [[TMP40]] to i32 // CHECK9-NEXT: [[CALL6:%.*]] = invoke noundef i32 @_Z5tmainIcLi5EEiv() -// CHECK9-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK9-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK9: invoke.cont5: // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK9-NEXT: [[CALL8:%.*]] = invoke noundef i32 @_Z5tmainI1SLi1EEiv() -// CHECK9-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK9-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK9: invoke.cont7: // CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK9-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK9-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP41]] // CHECK9: eh.resume: @@ -2755,7 +2741,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -2768,14 +2754,14 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK9-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: () #[[ATTR2:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2836,7 +2822,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2891,7 +2877,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -2908,21 +2894,21 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// 
CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10:[0-9]+]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7:[0-9]+]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { -// CHECK9-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] -// CHECK9-NEXT: call void @_ZSt9terminatev() #[[ATTR10]] +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK9-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR3]] +// CHECK9-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK9-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (i64 noundef [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 @@ -2931,7 +2917,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2997,7 +2983,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3052,7 +3038,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -3069,14 +3055,14 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -3112,7 +3098,7 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label 
[[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -3145,14 +3131,14 @@ int main() { // CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK9-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: ret i32 0 // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK9-SAME: () #[[ATTR7]] comdat { +// CHECK9-SAME: () #[[ATTR6]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -3188,7 +3174,7 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -3221,24 +3207,24 @@ int main() { // CHECK9-NEXT: 
[[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK9-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: ret i32 0 // // // CHECK9-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat align 2 { +// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] +// CHECK9-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]] // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -3252,7 +3238,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK9-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) 
unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -3261,14 +3247,14 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52 -// CHECK9-SAME: () #[[ATTR3]] { +// CHECK9-SAME: () #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3329,7 +3315,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3384,7 +3370,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: 
store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -3401,21 +3387,21 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57 -// CHECK9-SAME: () #[[ATTR3]] { +// CHECK9-SAME: () #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3476,7 +3462,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3531,7 +3517,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -3548,21 +3534,21 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: 
[[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52 -// CHECK9-SAME: () #[[ATTR3]] { +// CHECK9-SAME: () #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3623,7 +3609,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3678,7 +3664,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -3695,21 +3681,21 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57 -// CHECK9-SAME: () #[[ATTR3]] { +// CHECK9-SAME: () #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3751,14 +3737,14 @@ int main() { // CHECK9-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK9: omp.inner.for.body: // CHECK9-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: [[CALL:%.*]] = invoke noundef i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK9-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]] +// CHECK9-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]] // CHECK9: invoke.cont2: // CHECK9-NEXT: [[TMP7:%.*]] = sext i8 [[CALL]] to i32 // CHECK9-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP1]], i32 [[TMP7]]) -// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] +// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR3]] // CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK9-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] 
to i64 // CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 @@ -3778,14 +3764,14 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP14:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP15:%.*]] = extractvalue { ptr, i32 } [[TMP14]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP15]]) #[[ATTR10]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP15]]) #[[ATTR7]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3840,7 +3826,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -3857,19 +3843,12 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP12:%.*]] = extractvalue { ptr, 
i32 } [[TMP11]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK9-NEXT: unreachable // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: @@ -3889,7 +3868,7 @@ int main() { // CHECK13-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK13-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK13-NEXT: [[CALL:%.*]] = invoke noundef i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK13-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -3922,16 +3901,16 @@ int main() { // CHECK13-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK13-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR3:[0-9]+]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: lpad: // CHECK13-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: cleanup +// CHECK13-NEXT: cleanup // CHECK13-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK13-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK13-NEXT: [[TMP17:%.*]] 
= extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK13-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK13-NEXT: br label [[EH_RESUME:%.*]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -3975,21 +3954,21 @@ int main() { // CHECK13-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK13-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK13: omp_offload.failed3: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK13: omp_offload.cont4: // CHECK13-NEXT: [[TMP40:%.*]] = load i8, ptr [[A]], align 1 // CHECK13-NEXT: [[CONV:%.*]] = sext i8 [[TMP40]] to i32 // CHECK13-NEXT: [[CALL6:%.*]] = invoke noundef i32 @_Z5tmainIcLi5EEiv() -// CHECK13-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK13-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK13: invoke.cont5: // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK13-NEXT: [[CALL8:%.*]] = invoke noundef i32 @_Z5tmainI1SLi1EEiv() -// CHECK13-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK13-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK13: invoke.cont7: // CHECK13-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK13-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP41:%.*]] 
= load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP41]] // CHECK13: eh.resume: @@ -4014,7 +3993,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4027,14 +4006,14 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK13-SAME: () #[[ATTR2:[0-9]+]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4095,7 +4074,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 
noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4150,7 +4129,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -4167,21 +4146,21 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10:[0-9]+]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7:[0-9]+]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK13-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { -// CHECK13-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] -// CHECK13-NEXT: call void @_ZSt9terminatev() #[[ATTR10]] +// CHECK13-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK13-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR3]] +// CHECK13-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK13-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (i64 noundef [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // 
CHECK13-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 @@ -4190,7 +4169,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4256,7 +4235,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4311,7 +4290,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -4328,14 +4307,14 @@ int main() { // CHECK13-NEXT: ret void // 
CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK13-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR6:[0-9]+]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -4371,7 +4350,7 @@ int main() { // CHECK13-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK13-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -4404,14 +4383,14 @@ int main() { // CHECK13-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK13-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK13: omp_offload.failed3: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK13: omp_offload.cont4: // CHECK13-NEXT: ret i32 0 // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK13-SAME: () 
#[[ATTR7]] comdat { +// CHECK13-SAME: () #[[ATTR6]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -4447,7 +4426,7 @@ int main() { // CHECK13-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK13-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -4480,24 +4459,24 @@ int main() { // CHECK13-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK13-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK13: omp_offload.failed3: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK13: omp_offload.cont4: // CHECK13-NEXT: ret i32 0 // // // CHECK13-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat align 2 { +// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 
dereferenceable(24) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -4511,14 +4490,14 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52 -// CHECK13-SAME: () #[[ATTR3]] { +// CHECK13-SAME: () #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4579,7 +4558,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// 
CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4634,7 +4613,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -4651,21 +4630,21 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57 -// CHECK13-SAME: () #[[ATTR3]] { +// CHECK13-SAME: () #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4726,7 +4705,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4781,7 +4760,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -4798,21 +4777,21 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // 
CHECK13-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52 -// CHECK13-SAME: () #[[ATTR3]] { +// CHECK13-SAME: () #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4873,7 +4852,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, 
align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4928,7 +4907,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -4945,21 +4924,21 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57 -// CHECK13-SAME: () #[[ATTR3]] { +// CHECK13-SAME: () #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5001,14 +4980,14 @@ int main() { // CHECK13-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK13: omp.inner.for.body: // CHECK13-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: [[CALL:%.*]] = invoke noundef i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK13-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]] +// CHECK13-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]] // CHECK13: invoke.cont2: // CHECK13-NEXT: [[TMP7:%.*]] = sext i8 [[CALL]] to i32 // CHECK13-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP1]], i32 [[TMP7]]) -// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK13-NEXT: 
[[TMP9:%.*]] = zext i32 [[TMP8]] to i64 // CHECK13-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 @@ -5028,14 +5007,14 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP14:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP15:%.*]] = extractvalue { ptr, i32 } [[TMP14]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP15]]) #[[ATTR10]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP15]]) #[[ATTR7]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5090,7 +5069,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5107,24 +5086,17 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch 
ptr null // CHECK13-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR10]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK13-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: ret void // -// -// CHECK13-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK13-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK13-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp index 288ebad9b673a..249609c7d831c 100644 --- a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp @@ -317,13 +317,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -502,13 +495,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// 
CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -835,7 +821,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1220,13 +1206,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1547,7 +1526,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1925,10 +1904,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp index 3994714891b5b..8784611d8399e 100644 --- a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp @@ -586,10 +586,3 @@ int main() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// 
-// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp index 5a4285bb95e35..e0618cb992459 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp @@ -2706,13 +2706,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -4601,13 +4594,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -9597,13 +9583,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -14437,13 +14416,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // 
CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp index 86b097256edc1..5c9b2aa1f47fb 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp @@ -517,13 +517,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -786,13 +779,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1284,7 +1270,7 @@ int main() { // // // CHECK8-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK8-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK8-SAME: () #[[ATTR1]] comdat { // CHECK8-NEXT: entry: // CHECK8-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK8-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1776,13 +1762,6 @@ int main() { // CHECK8-NEXT: ret void // // -// CHECK8-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK8-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK8-NEXT: entry: -// CHECK8-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK8-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@main // CHECK10-SAME: () #[[ATTR0:[0-9]+]] { // CHECK10-NEXT: entry: @@ -2232,7 +2211,7 @@ int main() { // // // 
CHECK10-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK10-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK10-SAME: () #[[ATTR1]] comdat { // CHECK10-NEXT: entry: // CHECK10-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK10-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2718,13 +2697,6 @@ int main() { // CHECK10-NEXT: ret void // // -// CHECK10-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK10-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK10-NEXT: entry: -// CHECK10-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK10-NEXT: ret void -// -// // CHECK12-LABEL: define {{[^@]+}}@main // CHECK12-SAME: () #[[ATTR0:[0-9]+]] { // CHECK12-NEXT: entry: @@ -2804,11 +2776,11 @@ int main() { // CHECK12: arraydestroy.body: // CHECK12-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK12-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK12-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4:[0-9]+]] +// CHECK12-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3:[0-9]+]] // CHECK12-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK12-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK12: arraydestroy.done7: -// CHECK12-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK12-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK12-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK12-NEXT: ret i32 [[TMP14]] // @@ -2837,7 +2809,7 @@ int main() { // // // CHECK12-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK12-SAME: () #[[ATTR3:[0-9]+]] comdat { +// 
CHECK12-SAME: () #[[ATTR1]] comdat { // CHECK12-NEXT: entry: // CHECK12-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK12-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2910,11 +2882,11 @@ int main() { // CHECK12: arraydestroy.body: // CHECK12-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK12-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK12-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK12-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK12-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK12-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK12: arraydestroy.done7: -// CHECK12-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK12-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK12-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK12-NEXT: ret i32 [[TMP14]] // @@ -2925,7 +2897,7 @@ int main() { // CHECK12-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK12-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK12-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK12-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK12-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK12-NEXT: ret void // // @@ -2992,7 +2964,7 @@ int main() { // CHECK12-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK12-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK12-NEXT: 
[[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK12-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK12-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK12-NEXT: ret void // // @@ -3107,11 +3079,11 @@ int main() { // CHECK14: arraydestroy.body: // CHECK14-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK14-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK14-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4:[0-9]+]] +// CHECK14-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3:[0-9]+]] // CHECK14-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK14-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK14: arraydestroy.done6: -// CHECK14-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK14-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK14-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK14-NEXT: ret i32 [[TMP14]] // @@ -3140,7 +3112,7 @@ int main() { // // // CHECK14-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK14-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK14-SAME: () #[[ATTR1]] comdat { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -3211,11 +3183,11 @@ int main() { // CHECK14: arraydestroy.body: // CHECK14-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_INNER_FOR_END]] ], [ 
[[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK14-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK14-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK14-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK14-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK14-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK14: arraydestroy.done6: -// CHECK14-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK14-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK14-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK14-NEXT: ret i32 [[TMP14]] // @@ -3226,7 +3198,7 @@ int main() { // CHECK14-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK14-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK14-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK14-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK14-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK14-NEXT: ret void // // @@ -3293,7 +3265,7 @@ int main() { // CHECK14-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK14-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK14-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK14-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK14-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK14-NEXT: ret void // // diff --git 
a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp index de8d061aa6a6a..67384abc7751e 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp @@ -1719,13 +1719,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -3563,13 +3556,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -5748,13 +5734,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -7592,13 +7571,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp 
b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp index 53ea358f1adc3..adef55eee1cd5 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp @@ -479,13 +479,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -765,13 +758,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1300,7 +1286,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1828,13 +1814,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -2321,7 +2300,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca 
i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2843,13 +2822,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -2963,14 +2935,14 @@ int main() { // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i64 4, i1 false) // CHECK13-NEXT: [[TMP17:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK13-NEXT: store i32 [[TMP17]], ptr @_ZZ4mainE4svar, align 4 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: [[ARRAY_BEGIN14:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR6]], i32 0, i32 0 // CHECK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN14]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_ARRAYCPY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN14]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE15:%.*]], 
label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done15: @@ -2982,11 +2954,11 @@ int main() { // CHECK13: arraydestroy.body18: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST19:%.*]] = phi ptr [ [[TMP19]], [[ARRAYDESTROY_DONE15]] ], [ [[ARRAYDESTROY_ELEMENT20:%.*]], [[ARRAYDESTROY_BODY18]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT20]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST19]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT20]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT20]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE21:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT20]], [[ARRAY_BEGIN17]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE21]], label [[ARRAYDESTROY_DONE22:%.*]], label [[ARRAYDESTROY_BODY18]] // CHECK13: arraydestroy.done22: -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP20:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP20]] // @@ -3020,12 +2992,12 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR1]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TEST:%.*]] = alloca 
[[STRUCT_S_0:%.*]], align 4 @@ -3129,14 +3101,14 @@ int main() { // CHECK13: omp.arraycpy.done13: // CHECK13-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP8]], align 8 // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i64 4, i1 false) -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAY_BEGIN14:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR6]], i32 0, i32 0 // CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN14]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN14]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done15: @@ -3147,11 +3119,11 @@ int main() { // CHECK13: arraydestroy.body17: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE15]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 
dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAY_BEGIN16]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17]] // CHECK13: arraydestroy.done21: -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP19]] // @@ -3219,7 +3191,7 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // @@ -3368,14 +3340,14 @@ int main() { // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i32 4, i1 false) // CHECK15-NEXT: [[TMP17:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK15-NEXT: store i32 [[TMP17]], ptr @_ZZ4mainE4svar, align 4 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR4:[0-9]+]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR3:[0-9]+]] // CHECK15-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR6]], i32 0, i32 0 // CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN13]], i32 2 // CHECK15-NEXT: br label 
[[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_ARRAYCPY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done14: @@ -3387,11 +3359,11 @@ int main() { // CHECK15: arraydestroy.body17: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP19]], [[ARRAYDESTROY_DONE14]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAY_BEGIN16]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17]] // CHECK15: arraydestroy.done21: -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP20:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: 
ret i32 [[TMP20]] // @@ -3425,12 +3397,12 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // // CHECK15-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK15-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK15-SAME: () #[[ATTR1]] comdat { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK15-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -3532,14 +3504,14 @@ int main() { // CHECK15: omp.arraycpy.done12: // CHECK15-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP8]], align 4 // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i32 4, i1 false) -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR6]], i32 0, i32 0 // CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN13]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void 
@_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done14: @@ -3550,11 +3522,11 @@ int main() { // CHECK15: arraydestroy.body16: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST17:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE14]] ], [ [[ARRAYDESTROY_ELEMENT18:%.*]], [[ARRAYDESTROY_BODY16]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT18]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST17]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE19:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT18]], [[ARRAY_BEGIN15]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_DONE20:%.*]], label [[ARRAYDESTROY_BODY16]] // CHECK15: arraydestroy.done20: -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP19]] // @@ -3622,7 +3594,7 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: 
ret void // // diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp index c717d827eed48..0a0ed699acb1a 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp @@ -112,7 +112,7 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: call void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[S]], i64 0) // CHECK1-NEXT: [[CALL:%.*]] = invoke signext i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[S]]) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -145,16 +145,16 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR3:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: lpad: // CHECK1-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: cleanup +// CHECK1-NEXT: cleanup // CHECK1-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK1-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK1-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr 
nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK1-NEXT: br label [[EH_RESUME:%.*]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -198,21 +198,21 @@ int main() { // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[TMP40:%.*]] = load i8, ptr [[A]], align 1 // CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP40]] to i32 // CHECK1-NEXT: [[CALL6:%.*]] = invoke i32 @_Z5tmainIcLi5EEiv() -// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK1: invoke.cont5: // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK1-NEXT: [[CALL8:%.*]] = invoke i32 @_Z5tmainI1SLi1EEiv() -// CHECK1-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK1: invoke.cont7: // CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK1-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK1-NEXT: ret i32 [[TMP41]] // CHECK1: eh.resume: @@ -237,7 +237,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK1-SAME: (ptr nonnull align 8 
dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -250,14 +250,14 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR2:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -325,7 +325,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -380,7 +380,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] // CHECK1-NEXT: invoke void @_Z3foov() -// 
CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -404,21 +404,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10:[0-9]+]], !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7:[0-9]+]], !llvm.access.group [[ACC_GRP13]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK1-SAME: (ptr [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { -// CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] -// CHECK1-NEXT: call void @_ZSt9terminatev() #[[ATTR10]] +// CHECK1-SAME: (ptr [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR3]] +// CHECK1-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (i64 [[A:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 @@ -427,7 +427,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: 
(ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -500,7 +500,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -555,7 +555,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP21]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -579,14 +579,14 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) 
#[[ATTR7]], !llvm.access.group [[ACC_GRP21]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -622,7 +622,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -655,14 +655,14 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret i32 0 // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK1-SAME: () #[[ATTR7]] comdat { +// CHECK1-SAME: () #[[ATTR6]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -698,7 +698,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label 
[[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -731,24 +731,24 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR4]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR3]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret i32 0 // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat align 2 { +// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK1-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]] // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(24) 
[[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -762,7 +762,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK1-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -771,14 +771,14 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -846,7 +846,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], 
i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -901,7 +901,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP27]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -925,21 +925,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP27]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP27]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1007,7 +1007,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1062,7 +1062,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP33]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1086,21 +1086,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: 
[[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP33]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP33]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1168,7 +1168,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, 
align 8 @@ -1223,7 +1223,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP39]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1247,21 +1247,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP39]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP39]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1303,14 +1303,14 @@ int main() { // CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: // CHECK1-NEXT: invoke void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 23) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP42]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP42]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[CALL:%.*]] = invoke signext i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK1-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP42]] +// CHECK1-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP42]] // CHECK1: invoke.cont2: // CHECK1-NEXT: [[TMP7:%.*]] = sext i8 [[CALL]] to i32 // CHECK1-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP1]], i32 [[TMP7]]), !llvm.access.group [[ACC_GRP42]] -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]], !llvm.access.group [[ACC_GRP42]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) 
#[[ATTR3]], !llvm.access.group [[ACC_GRP42]] // CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP42]] // CHECK1-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP42]] @@ -1337,14 +1337,14 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP16:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP16]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP42]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP42]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1399,7 +1399,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP45]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] // CHECK1: invoke.cont: 
// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1423,19 +1423,12 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP45]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP45]] // CHECK1-NEXT: unreachable // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { // CHECK3-NEXT: entry: @@ -1457,7 +1450,7 @@ int main() { // CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK3-NEXT: call void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[S]], i64 0) // CHECK3-NEXT: [[CALL:%.*]] = invoke signext i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[S]]) -// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK3: invoke.cont: // CHECK3-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 @@ -1476,7 +1469,7 @@ int main() { // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP2]] // CHECK3: 
invoke.cont1: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -1488,12 +1481,12 @@ int main() { // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK3: lpad: // CHECK3-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } -// CHECK3-NEXT: cleanup +// CHECK3-NEXT: cleanup // CHECK3-NEXT: [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0 // CHECK3-NEXT: store ptr [[TMP6]], ptr [[EXN_SLOT]], align 8 // CHECK3-NEXT: [[TMP7:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 1 // CHECK3-NEXT: store i32 [[TMP7]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR7:[0-9]+]] +// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5:[0-9]+]] // CHECK3-NEXT: br label [[EH_RESUME:%.*]] // CHECK3: omp.inner.for.end: // CHECK3-NEXT: store i32 100, ptr [[I]], align 4 @@ -1513,7 +1506,7 @@ int main() { // CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 0, [[MUL11]] // CHECK3-NEXT: store i32 [[ADD12]], ptr [[I7]], align 4, !llvm.access.group [[ACC_GRP6]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP6]] +// CHECK3-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP6]] // CHECK3: invoke.cont13: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE14:%.*]] // CHECK3: omp.body.continue14: @@ -1528,15 +1521,15 @@ int main() { // CHECK3-NEXT: [[TMP13:%.*]] = load i8, ptr [[A]], align 1 // CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP13]] to i32 // CHECK3-NEXT: [[CALL19:%.*]] = invoke i32 @_Z5tmainIcLi5EEiv() -// CHECK3-NEXT: to label [[INVOKE_CONT18:%.*]] unwind label [[LPAD]] +// CHECK3-NEXT: to label [[INVOKE_CONT18:%.*]] unwind label [[LPAD]] // CHECK3: invoke.cont18: // CHECK3-NEXT: [[ADD20:%.*]] = add nsw i32 [[CONV]], [[CALL19]] // CHECK3-NEXT: [[CALL22:%.*]] = invoke i32 
@_Z5tmainI1SLi1EEiv() -// CHECK3-NEXT: to label [[INVOKE_CONT21:%.*]] unwind label [[LPAD]] +// CHECK3-NEXT: to label [[INVOKE_CONT21:%.*]] unwind label [[LPAD]] // CHECK3: invoke.cont21: // CHECK3-NEXT: [[ADD23:%.*]] = add nsw i32 [[ADD20]], [[CALL22]] // CHECK3-NEXT: store i32 [[ADD23]], ptr [[RETVAL]], align 4 -// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR7]] +// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK3-NEXT: ret i32 [[TMP14]] // CHECK3: eh.resume: @@ -1547,9 +1540,9 @@ int main() { // CHECK3-NEXT: resume { ptr, i32 } [[LPAD_VAL24]] // CHECK3: terminate.lpad: // CHECK3-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK3-NEXT: catch ptr null +// CHECK3-NEXT: catch ptr null // CHECK3-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 -// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP16]]) #[[ATTR8:[0-9]+]], !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP16]]) #[[ATTR6:[0-9]+]], !llvm.access.group [[ACC_GRP2]] // CHECK3-NEXT: unreachable // // @@ -1567,7 +1560,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK3-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK3-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1580,14 +1573,14 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK3-SAME: (ptr [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] comdat { -// CHECK3-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR7]] -// CHECK3-NEXT: call void @_ZSt9terminatev() #[[ATTR8]] +// CHECK3-SAME: (ptr [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] 
comdat { +// CHECK3-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5]] +// CHECK3-NEXT: call void @_ZSt9terminatev() #[[ATTR6]] // CHECK3-NEXT: unreachable // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat personality ptr @__gxx_personality_v0 { +// CHECK3-SAME: () #[[ATTR4:[0-9]+]] comdat personality ptr @__gxx_personality_v0 { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 @@ -1615,7 +1608,7 @@ int main() { // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP9]] +// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP9]] // CHECK3: invoke.cont: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -1643,7 +1636,7 @@ int main() { // CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] // CHECK3-NEXT: store i32 [[ADD11]], ptr [[I6]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP12]] +// CHECK3-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP12]] // CHECK3: invoke.cont12: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE13:%.*]] // CHECK3: omp.body.continue13: @@ -1658,14 +1651,14 @@ int main() { // CHECK3-NEXT: ret i32 0 // CHECK3: terminate.lpad: // CHECK3-NEXT: [[TMP10:%.*]] = landingpad { ptr, i32 } -// CHECK3-NEXT: catch ptr null +// CHECK3-NEXT: catch ptr null // CHECK3-NEXT: [[TMP11:%.*]] = extractvalue { ptr, i32 } [[TMP10]], 0 -// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) 
#[[ATTR8]], !llvm.access.group [[ACC_GRP9]] +// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR6]], !llvm.access.group [[ACC_GRP9]] // CHECK3-NEXT: unreachable // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK3-SAME: () #[[ATTR5]] comdat personality ptr @__gxx_personality_v0 { +// CHECK3-SAME: () #[[ATTR4]] comdat personality ptr @__gxx_personality_v0 { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 @@ -1693,7 +1686,7 @@ int main() { // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP15]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP15]] +// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP15]] // CHECK3: invoke.cont: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -1721,7 +1714,7 @@ int main() { // CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] // CHECK3-NEXT: store i32 [[ADD11]], ptr [[I6]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP18]] // CHECK3: invoke.cont12: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE13:%.*]] // CHECK3: omp.body.continue13: @@ -1736,24 +1729,24 @@ int main() { // CHECK3-NEXT: ret i32 0 // CHECK3: terminate.lpad: // CHECK3-NEXT: [[TMP10:%.*]] = landingpad { ptr, i32 } -// CHECK3-NEXT: catch ptr null +// CHECK3-NEXT: catch ptr null // CHECK3-NEXT: [[TMP11:%.*]] = extractvalue { ptr, i32 } [[TMP10]], 0 -// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR8]], 
!llvm.access.group [[ACC_GRP15]] +// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR6]], !llvm.access.group [[ACC_GRP15]] // CHECK3-NEXT: unreachable // // // CHECK3-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK3-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6:[0-9]+]] comdat align 2 { +// CHECK3-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK3-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR7]] +// CHECK3-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]] // CHECK3-NEXT: ret void // // // CHECK3-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK3-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { +// CHECK3-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -1767,7 +1760,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK3-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { +// CHECK3-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1794,7 +1787,7 @@ int main() { // CHECK5-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK5-NEXT: call void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[S]], i64 0) // CHECK5-NEXT: [[CALL:%.*]] = invoke 
signext i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[S]]) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK5-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -1827,16 +1820,16 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR4:[0-9]+]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR3:[0-9]+]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: lpad: // CHECK5-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: cleanup +// CHECK5-NEXT: cleanup // CHECK5-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK5-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK5-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK5-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK5-NEXT: br label [[EH_RESUME:%.*]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -1880,21 +1873,21 @@ int main() { // CHECK5-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK5-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR4]] +// CHECK5-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: [[TMP40:%.*]] = load i8, ptr [[A]], align 1 // CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP40]] to i32 // CHECK5-NEXT: [[CALL6:%.*]] = invoke i32 @_Z5tmainIcLi5EEiv() -// CHECK5-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK5: invoke.cont5: // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK5-NEXT: [[CALL8:%.*]] = invoke i32 @_Z5tmainI1SLi1EEiv() -// CHECK5-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK5: invoke.cont7: // CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK5-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK5-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK5-NEXT: ret i32 [[TMP41]] // CHECK5: eh.resume: @@ -1919,7 +1912,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK5-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK5-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1932,14 +1925,14 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK5-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR2:[0-9]+]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2007,7 +2000,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2062,7 +2055,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2086,21 +2079,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// 
CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10:[0-9]+]], !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7:[0-9]+]], !llvm.access.group [[ACC_GRP13]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK5-SAME: (ptr [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { -// CHECK5-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] -// CHECK5-NEXT: call void @_ZSt9terminatev() #[[ATTR10]] +// CHECK5-SAME: (ptr [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK5-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR3]] +// CHECK5-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK5-SAME: (i64 [[A:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (i64 [[A:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK5-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 @@ -2109,7 +2102,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2182,7 +2175,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias 
[[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2237,7 +2230,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP21]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2261,14 +2254,14 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP21]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP21]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK5-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK5-SAME: () #[[ATTR6:[0-9]+]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -2304,7 +2297,7 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 
[[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -2337,14 +2330,14 @@ int main() { // CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK5-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: ret i32 0 // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK5-SAME: () #[[ATTR7]] comdat { +// CHECK5-SAME: () #[[ATTR6]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -2380,7 +2373,7 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS2]], i32 0, i32 0 @@ -2413,24 +2406,24 @@ int main() { // CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK5-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR4]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR3]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: ret i32 0 // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK5-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat align 2 { +// CHECK5-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK5-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]] // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK5-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK5-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -2444,14 +2437,14 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52 -// CHECK5-SAME: () #[[ATTR3]] { +// CHECK5-SAME: () #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, 
...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2519,7 +2512,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2574,7 +2567,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP27]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2598,21 +2591,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: 
[[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP27]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP27]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57 -// CHECK5-SAME: () #[[ATTR3]] { +// CHECK5-SAME: () #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2680,7 +2673,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 
8 @@ -2735,7 +2728,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP33]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2759,21 +2752,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP33]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP33]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52 -// CHECK5-SAME: () #[[ATTR3]] { +// CHECK5-SAME: () #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2841,7 +2834,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2896,7 +2889,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP39]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2920,21 +2913,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: 
[[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP39]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP39]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57 -// CHECK5-SAME: () #[[ATTR3]] { +// CHECK5-SAME: () #[[ATTR2]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2976,14 +2969,14 @@ int main() { // CHECK5-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK5: omp.inner.for.body: // CHECK5-NEXT: invoke void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 23) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP42]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP42]] // CHECK5: invoke.cont: // CHECK5-NEXT: [[CALL:%.*]] = invoke signext i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) 
[[REF_TMP]]) -// CHECK5-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP42]] +// CHECK5-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP42]] // CHECK5: invoke.cont2: // CHECK5-NEXT: [[TMP7:%.*]] = sext i8 [[CALL]] to i32 // CHECK5-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP1]], i32 [[TMP7]]), !llvm.access.group [[ACC_GRP42]] -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]], !llvm.access.group [[ACC_GRP42]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP42]] // CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP42]] // CHECK5-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 // CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP42]] @@ -3010,14 +3003,14 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP16:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP16]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP42]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP42]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) 
#[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3072,7 +3065,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP45]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -3096,14 +3089,14 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP45]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP45]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK5-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK5-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -3111,13 +3104,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // 
CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: @@ -3137,7 +3123,7 @@ int main() { // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK9-NEXT: call void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[S]], i64 0) // CHECK9-NEXT: [[CALL:%.*]] = invoke i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[S]]) -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK9: invoke.cont: // CHECK9-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -3170,16 +3156,16 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: lpad: // CHECK9-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: cleanup +// CHECK9-NEXT: cleanup // CHECK9-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK9-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK9-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK9-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK9-NEXT: br label [[EH_RESUME:%.*]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -3223,21 +3209,21 @@ int main() { // CHECK9-NEXT: 
[[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: [[TMP40:%.*]] = load i8, ptr [[A]], align 1 // CHECK9-NEXT: [[CONV:%.*]] = sext i8 [[TMP40]] to i32 // CHECK9-NEXT: [[CALL6:%.*]] = invoke i32 @_Z5tmainIcLi5EEiv() -// CHECK9-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK9-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK9: invoke.cont5: // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK9-NEXT: [[CALL8:%.*]] = invoke i32 @_Z5tmainI1SLi1EEiv() -// CHECK9-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK9-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK9: invoke.cont7: // CHECK9-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK9-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK9-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP41]] // CHECK9: eh.resume: @@ -3262,7 +3248,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -3275,14 +3261,14 @@ int main() { 
// // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK9-SAME: () #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: () #[[ATTR2:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3350,7 +3336,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3405,7 +3391,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] // CHECK9: 
invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -3429,21 +3415,21 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10:[0-9]+]], !llvm.access.group [[ACC_GRP13]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7:[0-9]+]], !llvm.access.group [[ACC_GRP13]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK9-SAME: (ptr [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { -// CHECK9-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] -// CHECK9-NEXT: call void @_ZSt9terminatev() #[[ATTR10]] +// CHECK9-SAME: (ptr [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK9-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR3]] +// CHECK9-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK9-SAME: (i64 [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (i64 [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 @@ -3452,7 +3438,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // 
CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3525,7 +3511,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3580,7 +3566,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP21]] // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -3604,14 +3590,14 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP21]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP21]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { 
// CHECK9-NEXT: entry: // CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -3647,7 +3633,7 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -3680,14 +3666,14 @@ int main() { // CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK9-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: ret i32 0 // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK9-SAME: () #[[ATTR7]] comdat { +// CHECK9-SAME: () #[[ATTR6]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -3723,7 +3709,7 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR4]] +// CHECK9-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -3756,24 +3742,24 @@ int main() { // CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK9-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: ret i32 0 // // // CHECK9-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat align 2 { +// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] +// CHECK9-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]] // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -3787,7 +3773,7 @@ int 
main() { // // // CHECK9-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK9-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -3796,14 +3782,14 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52 -// CHECK9-SAME: () #[[ATTR3]] { +// CHECK9-SAME: () #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3871,7 +3857,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3926,7 +3912,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP27]] // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -3950,21 +3936,21 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP27]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP27]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57 -// CHECK9-SAME: () #[[ATTR3]] { +// CHECK9-SAME: () #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4032,7 +4018,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4087,7 +4073,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP33]] // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -4111,21 +4097,21 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: 
[[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP33]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP33]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52 -// CHECK9-SAME: () #[[ATTR3]] { +// CHECK9-SAME: () #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4193,7 +4179,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, 
align 8 @@ -4248,7 +4234,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP39]] // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] // CHECK9: invoke.cont: // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -4272,21 +4258,21 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP39]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP39]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57 -// CHECK9-SAME: () #[[ATTR3]] { +// CHECK9-SAME: () #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4328,14 +4314,14 @@ int main() { // CHECK9-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK9: omp.inner.for.body: // CHECK9-NEXT: invoke void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 23) -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP42]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP42]] // CHECK9: invoke.cont: // CHECK9-NEXT: [[CALL:%.*]] = invoke i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK9-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP42]] +// CHECK9-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP42]] // CHECK9: invoke.cont2: // CHECK9-NEXT: [[TMP7:%.*]] = sext i8 [[CALL]] to i32 // CHECK9-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP1]], i32 [[TMP7]]), !llvm.access.group [[ACC_GRP42]] -// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]], !llvm.access.group [[ACC_GRP42]] +// CHECK9-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) 
#[[ATTR3]], !llvm.access.group [[ACC_GRP42]] // CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP42]] // CHECK9-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 // CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP42]] @@ -4362,14 +4348,14 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP16:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP16]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP42]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP42]] // CHECK9-NEXT: unreachable // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4424,7 +4410,7 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP45]] // CHECK9-NEXT: invoke void @_Z3foov() -// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] +// CHECK9-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] // CHECK9: invoke.cont: 
// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -4448,19 +4434,12 @@ int main() { // CHECK9-NEXT: ret void // CHECK9: terminate.lpad: // CHECK9-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK9-NEXT: catch ptr null +// CHECK9-NEXT: catch ptr null // CHECK9-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP45]] +// CHECK9-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP45]] // CHECK9-NEXT: unreachable // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { // CHECK11-NEXT: entry: @@ -4482,7 +4461,7 @@ int main() { // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK11-NEXT: call void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[S]], i64 0) // CHECK11-NEXT: [[CALL:%.*]] = invoke i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[S]]) -// CHECK11-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK11-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK11: invoke.cont: // CHECK11-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK11-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 @@ -4501,7 +4480,7 @@ int main() { // CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] // CHECK11-NEXT: invoke void @_Z3foov() -// CHECK11-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP2]] +// CHECK11-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP2]] // 
CHECK11: invoke.cont1: // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK11: omp.body.continue: @@ -4513,12 +4492,12 @@ int main() { // CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK11: lpad: // CHECK11-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } -// CHECK11-NEXT: cleanup +// CHECK11-NEXT: cleanup // CHECK11-NEXT: [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0 // CHECK11-NEXT: store ptr [[TMP6]], ptr [[EXN_SLOT]], align 8 // CHECK11-NEXT: [[TMP7:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 1 // CHECK11-NEXT: store i32 [[TMP7]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK11-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR7:[0-9]+]] +// CHECK11-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5:[0-9]+]] // CHECK11-NEXT: br label [[EH_RESUME:%.*]] // CHECK11: omp.inner.for.end: // CHECK11-NEXT: store i32 100, ptr [[I]], align 4 @@ -4538,7 +4517,7 @@ int main() { // CHECK11-NEXT: [[ADD12:%.*]] = add nsw i32 0, [[MUL11]] // CHECK11-NEXT: store i32 [[ADD12]], ptr [[I7]], align 4, !llvm.access.group [[ACC_GRP6]] // CHECK11-NEXT: invoke void @_Z3foov() -// CHECK11-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP6]] +// CHECK11-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP6]] // CHECK11: invoke.cont13: // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE14:%.*]] // CHECK11: omp.body.continue14: @@ -4553,15 +4532,15 @@ int main() { // CHECK11-NEXT: [[TMP13:%.*]] = load i8, ptr [[A]], align 1 // CHECK11-NEXT: [[CONV:%.*]] = sext i8 [[TMP13]] to i32 // CHECK11-NEXT: [[CALL19:%.*]] = invoke i32 @_Z5tmainIcLi5EEiv() -// CHECK11-NEXT: to label [[INVOKE_CONT18:%.*]] unwind label [[LPAD]] +// CHECK11-NEXT: to label [[INVOKE_CONT18:%.*]] unwind label [[LPAD]] // CHECK11: invoke.cont18: // CHECK11-NEXT: [[ADD20:%.*]] = add nsw i32 [[CONV]], [[CALL19]] // CHECK11-NEXT: 
[[CALL22:%.*]] = invoke i32 @_Z5tmainI1SLi1EEiv() -// CHECK11-NEXT: to label [[INVOKE_CONT21:%.*]] unwind label [[LPAD]] +// CHECK11-NEXT: to label [[INVOKE_CONT21:%.*]] unwind label [[LPAD]] // CHECK11: invoke.cont21: // CHECK11-NEXT: [[ADD23:%.*]] = add nsw i32 [[ADD20]], [[CALL22]] // CHECK11-NEXT: store i32 [[ADD23]], ptr [[RETVAL]], align 4 -// CHECK11-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR7]] +// CHECK11-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] // CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP14]] // CHECK11: eh.resume: @@ -4572,9 +4551,9 @@ int main() { // CHECK11-NEXT: resume { ptr, i32 } [[LPAD_VAL24]] // CHECK11: terminate.lpad: // CHECK11-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK11-NEXT: catch ptr null +// CHECK11-NEXT: catch ptr null // CHECK11-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 -// CHECK11-NEXT: call void @__clang_call_terminate(ptr [[TMP16]]) #[[ATTR8:[0-9]+]], !llvm.access.group [[ACC_GRP2]] +// CHECK11-NEXT: call void @__clang_call_terminate(ptr [[TMP16]]) #[[ATTR6:[0-9]+]], !llvm.access.group [[ACC_GRP2]] // CHECK11-NEXT: unreachable // // @@ -4592,7 +4571,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4605,14 +4584,14 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK11-SAME: (ptr [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] comdat { -// CHECK11-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR7]] -// CHECK11-NEXT: call void @_ZSt9terminatev() #[[ATTR8]] 
+// CHECK11-SAME: (ptr [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] comdat { +// CHECK11-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZSt9terminatev() #[[ATTR6]] // CHECK11-NEXT: unreachable // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat personality ptr @__gxx_personality_v0 { +// CHECK11-SAME: () #[[ATTR4:[0-9]+]] comdat personality ptr @__gxx_personality_v0 { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 @@ -4640,7 +4619,7 @@ int main() { // CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK11-NEXT: invoke void @_Z3foov() -// CHECK11-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP9]] +// CHECK11-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP9]] // CHECK11: invoke.cont: // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK11: omp.body.continue: @@ -4668,7 +4647,7 @@ int main() { // CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] // CHECK11-NEXT: store i32 [[ADD11]], ptr [[I6]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK11-NEXT: invoke void @_Z3foov() -// CHECK11-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP12]] +// CHECK11-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP12]] // CHECK11: invoke.cont12: // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE13:%.*]] // CHECK11: omp.body.continue13: @@ -4683,14 +4662,14 @@ int main() { // CHECK11-NEXT: ret i32 0 // CHECK11: terminate.lpad: // CHECK11-NEXT: [[TMP10:%.*]] = landingpad { ptr, i32 } -// CHECK11-NEXT: catch ptr null +// CHECK11-NEXT: catch ptr null // CHECK11-NEXT: [[TMP11:%.*]] = extractvalue { ptr, i32 } 
[[TMP10]], 0 -// CHECK11-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR8]], !llvm.access.group [[ACC_GRP9]] +// CHECK11-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR6]], !llvm.access.group [[ACC_GRP9]] // CHECK11-NEXT: unreachable // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK11-SAME: () #[[ATTR5]] comdat personality ptr @__gxx_personality_v0 { +// CHECK11-SAME: () #[[ATTR4]] comdat personality ptr @__gxx_personality_v0 { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 @@ -4718,7 +4697,7 @@ int main() { // CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP15]] // CHECK11-NEXT: invoke void @_Z3foov() -// CHECK11-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP15]] +// CHECK11-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP15]] // CHECK11: invoke.cont: // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK11: omp.body.continue: @@ -4746,7 +4725,7 @@ int main() { // CHECK11-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] // CHECK11-NEXT: store i32 [[ADD11]], ptr [[I6]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK11-NEXT: invoke void @_Z3foov() -// CHECK11-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP18]] +// CHECK11-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP18]] // CHECK11: invoke.cont12: // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE13:%.*]] // CHECK11: omp.body.continue13: @@ -4761,24 +4740,24 @@ int main() { // CHECK11-NEXT: ret i32 0 // CHECK11: terminate.lpad: // CHECK11-NEXT: [[TMP10:%.*]] = landingpad { ptr, i32 } -// CHECK11-NEXT: catch ptr null +// CHECK11-NEXT: catch ptr null // CHECK11-NEXT: [[TMP11:%.*]] = 
extractvalue { ptr, i32 } [[TMP10]], 0 -// CHECK11-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR8]], !llvm.access.group [[ACC_GRP15]] +// CHECK11-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR6]], !llvm.access.group [[ACC_GRP15]] // CHECK11-NEXT: unreachable // // // CHECK11-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6:[0-9]+]] comdat align 2 { +// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK11-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR7]] +// CHECK11-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]] // CHECK11-NEXT: ret void // // // CHECK11-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { +// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -4792,7 +4771,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { +// CHECK11-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4819,7 +4798,7 @@ int main() { // CHECK13-NEXT: store i32 0, ptr 
[[RETVAL]], align 4 // CHECK13-NEXT: call void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[S]], i64 0) // CHECK13-NEXT: [[CALL:%.*]] = invoke i8 @_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[S]]) -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK13: invoke.cont: // CHECK13-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK13-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -4852,16 +4831,16 @@ int main() { // CHECK13-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK13-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68() #[[ATTR3:[0-9]+]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: lpad: // CHECK13-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: cleanup +// CHECK13-NEXT: cleanup // CHECK13-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK13-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK13-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK13-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK13-NEXT: br label [[EH_RESUME:%.*]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -4905,21 +4884,21 @@ int main() { // CHECK13-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK13-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK13: omp_offload.failed3: 
-// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74(i64 [[TMP19]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK13: omp_offload.cont4: // CHECK13-NEXT: [[TMP40:%.*]] = load i8, ptr [[A]], align 1 // CHECK13-NEXT: [[CONV:%.*]] = sext i8 [[TMP40]] to i32 // CHECK13-NEXT: [[CALL6:%.*]] = invoke i32 @_Z5tmainIcLi5EEiv() -// CHECK13-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK13-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK13: invoke.cont5: // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK13-NEXT: [[CALL8:%.*]] = invoke i32 @_Z5tmainI1SLi1EEiv() -// CHECK13-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK13-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK13: invoke.cont7: // CHECK13-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK13-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP41]] // CHECK13: eh.resume: @@ -4944,7 +4923,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@_ZN1ScvcEv -// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 { +// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4957,14 +4936,14 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68 -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] { +// 
CHECK13-SAME: () #[[ATTR2:[0-9]+]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5032,7 +5011,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5087,7 +5066,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5111,21 
+5090,21 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10:[0-9]+]], !llvm.access.group [[ACC_GRP13]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7:[0-9]+]], !llvm.access.group [[ACC_GRP13]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK13-SAME: (ptr [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { -// CHECK13-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] -// CHECK13-NEXT: call void @_ZSt9terminatev() #[[ATTR10]] +// CHECK13-SAME: (ptr [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] comdat { +// CHECK13-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR3]] +// CHECK13-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK13-SAME: (i64 [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (i64 [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK13-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 @@ -5134,7 +5113,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 1 dereferenceable(1) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5207,7 +5186,7 @@ 
int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5262,7 +5241,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP21]] // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5286,14 +5265,14 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP21]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP21]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIcLi5EEiv -// CHECK13-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR6:[0-9]+]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[TMP:%.*]] = 
alloca i32, align 4 // CHECK13-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -5329,7 +5308,7 @@ int main() { // CHECK13-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK13-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52() #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -5362,14 +5341,14 @@ int main() { // CHECK13-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK13-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK13: omp_offload.failed3: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57() #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK13: omp_offload.cont4: // CHECK13-NEXT: ret i32 0 // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainI1SLi1EEiv -// CHECK13-SAME: () #[[ATTR7]] comdat { +// CHECK13-SAME: () #[[ATTR6]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -5405,7 +5384,7 @@ int main() { // CHECK13-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK13-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR4]] +// CHECK13-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52() #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -5438,24 +5417,24 @@ int main() { // CHECK13-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK13-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK13: omp_offload.failed3: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57() #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK13: omp_offload.cont4: // CHECK13-NEXT: ret i32 0 // // // CHECK13-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat align 2 { +// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SD2Ev(ptr nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 [[A:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[A_ADDR:%.*]] = alloca i64, align 
8 @@ -5469,14 +5448,14 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52 -// CHECK13-SAME: () #[[ATTR3]] { +// CHECK13-SAME: () #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5544,7 +5523,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l52.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5599,7 +5578,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP27]] // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] +// CHECK13-NEXT: to label 
[[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5623,21 +5602,21 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP27]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP27]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57 -// CHECK13-SAME: () #[[ATTR3]] { +// CHECK13-SAME: () #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5705,7 +5684,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l57.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr 
noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5760,7 +5739,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP33]] // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5784,21 +5763,21 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP33]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP33]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52 -// CHECK13-SAME: () #[[ATTR3]] { +// CHECK13-SAME: () #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5866,7 +5845,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l52.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5921,7 +5900,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP39]] // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5945,21 +5924,21 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: 
terminate.lpad: // CHECK13-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP39]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP39]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57 -// CHECK13-SAME: () #[[ATTR3]] { +// CHECK13-SAME: () #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined) // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6001,14 +5980,14 @@ int main() { // CHECK13-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK13: omp.inner.for.body: // CHECK13-NEXT: invoke void @_ZN1SC1El(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 23) -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP42]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP42]] // CHECK13: invoke.cont: // CHECK13-NEXT: [[CALL:%.*]] = invoke i8 
@_ZN1ScvcEv(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK13-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP42]] +// CHECK13-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP42]] // CHECK13: invoke.cont2: // CHECK13-NEXT: [[TMP7:%.*]] = sext i8 [[CALL]] to i32 // CHECK13-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP1]], i32 [[TMP7]]), !llvm.access.group [[ACC_GRP42]] -// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]], !llvm.access.group [[ACC_GRP42]] +// CHECK13-NEXT: call void @_ZN1SD1Ev(ptr nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP42]] // CHECK13-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP42]] // CHECK13-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 // CHECK13-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP42]] @@ -6035,14 +6014,14 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP16:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP16]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP42]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP42]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l57.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { +// CHECK13-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias 
[[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] personality ptr @__gxx_personality_v0 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6097,7 +6076,7 @@ int main() { // CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP45]] // CHECK13-NEXT: invoke void @_Z3foov() -// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] +// CHECK13-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] // CHECK13: invoke.cont: // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -6121,24 +6100,17 @@ int main() { // CHECK13-NEXT: ret void // CHECK13: terminate.lpad: // CHECK13-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK13-NEXT: catch ptr null +// CHECK13-NEXT: catch ptr null // CHECK13-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR10]], !llvm.access.group [[ACC_GRP45]] +// CHECK13-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP45]] // CHECK13-NEXT: unreachable // // // CHECK13-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat align 2 { +// CHECK13-SAME: (ptr nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR6]] comdat align 2 { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: ret void // -// -// CHECK13-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK13-SAME: 
() #[[ATTR9:[0-9]+]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK13-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp index 4ebcfa73d1bd3..22208e2e2c1dc 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp @@ -331,13 +331,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -530,13 +523,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1302,13 +1288,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -2036,13 +2015,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () 
#[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp index 9eeb1fc36a03e..d452ac3bed485 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp @@ -629,13 +629,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: diff --git a/clang/test/OpenMP/distribute_private_codegen.cpp b/clang/test/OpenMP/distribute_private_codegen.cpp index 5137fbdb3ebaf..8a47b15a24d24 100644 --- a/clang/test/OpenMP/distribute_private_codegen.cpp +++ b/clang/test/OpenMP/distribute_private_codegen.cpp @@ -218,13 +218,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -329,13 +322,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -661,7 +647,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // 
CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -942,13 +928,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1272,7 +1251,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1550,10 +1529,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp index 2283cb0df0c08..16d909e329514 100644 --- a/clang/test/OpenMP/distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_codegen.cpp @@ -1129,13 +1129,6 @@ int fint(void) { return ftemplate(); } // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z23without_schedule_clausePfS_S_S_ // CHECK3-SAME: (ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: 
entry: @@ -2109,13 +2102,6 @@ int fint(void) { return ftemplate(); } // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z23without_schedule_clausePfS_S_S_ // CHECK5-SAME: (ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2470,7 +2456,7 @@ int fint(void) { return ftemplate(); } // CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 7 // CHECK5-NEXT: [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]] // CHECK5-NEXT: store i32 [[SUB]], ptr [[I]], align 4 -// CHECK5-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP1]], align 8, !nontemporal !15 +// CHECK5-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP1]], align 8, !nontemporal [[META15:![0-9]+]] // CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 // CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM]] @@ -2487,7 +2473,7 @@ int fint(void) { return ftemplate(); } // CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[IDXPROM5]] // CHECK5-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 // CHECK5-NEXT: [[MUL7:%.*]] = fmul float [[MUL4]], [[TMP20]] -// CHECK5-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP0]], align 8, !nontemporal !15 +// CHECK5-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP0]], align 8, !nontemporal [[META15]] // CHECK5-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4 // CHECK5-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP22]] to i64 // CHECK5-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[IDXPROM8]] @@ -2904,7 +2890,7 @@ int fint(void) { return ftemplate(); } // CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 // CHECK5-NEXT: 
[[ADD10:%.*]] = add nsw i32 [[CONV9]], [[MUL]] // CHECK5-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i8 -// CHECK5-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal !15, !llvm.access.group [[ACC_GRP21]] +// CHECK5-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal [[META15]], !llvm.access.group [[ACC_GRP21]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: // CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -3132,13 +3118,6 @@ int fint(void) { return ftemplate(); } // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@_Z23without_schedule_clausePfS_S_S_ // CHECK7-SAME: (ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -3489,7 +3468,7 @@ int fint(void) { return ftemplate(); } // CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 7 // CHECK7-NEXT: [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]] // CHECK7-NEXT: store i32 [[SUB]], ptr [[I]], align 4 -// CHECK7-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP1]], align 4, !nontemporal !16 +// CHECK7-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP1]], align 4, !nontemporal [[META16:![0-9]+]] // CHECK7-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP13]] // CHECK7-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 @@ -3503,7 +3482,7 @@ int fint(void) { return ftemplate(); } // CHECK7-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 [[TMP19]] // CHECK7-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK7-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP20]] -// CHECK7-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP0]], align 4, 
!nontemporal !16 +// CHECK7-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP0]], align 4, !nontemporal [[META16]] // CHECK7-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4 // CHECK7-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 [[TMP22]] // CHECK7-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4 @@ -3915,7 +3894,7 @@ int fint(void) { return ftemplate(); } // CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 // CHECK7-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], [[MUL]] // CHECK7-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i8 -// CHECK7-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal !16, !llvm.access.group [[ACC_GRP22]] +// CHECK7-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal [[META16]], !llvm.access.group [[ACC_GRP22]] // CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK7: omp.body.continue: // CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -4143,13 +4122,6 @@ int fint(void) { return ftemplate(); } // CHECK7-NEXT: ret void // // -// CHECK7-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK7-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK7-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z23without_schedule_clausePfS_S_S_ // CHECK9-SAME: (ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -4887,7 +4859,7 @@ int fint(void) { return ftemplate(); } // CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP3]], 7 // CHECK13-NEXT: [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]] // CHECK13-NEXT: store i32 [[SUB]], ptr [[I]], align 4 -// CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nontemporal !7 +// CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !nontemporal [[META7:![0-9]+]] // CHECK13-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 // CHECK13-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP5]] to i64 // 
CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 [[IDXPROM]] @@ -4904,7 +4876,7 @@ int fint(void) { return ftemplate(); } // CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM4]] // CHECK13-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK13-NEXT: [[MUL6:%.*]] = fmul float [[MUL3]], [[TMP12]] -// CHECK13-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !nontemporal !7 +// CHECK13-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !nontemporal [[META7]] // CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK13-NEXT: [[IDXPROM7:%.*]] = sext i32 [[TMP14]] to i64 // CHECK13-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM7]] @@ -5045,7 +5017,7 @@ int fint(void) { return ftemplate(); } // CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 // CHECK13-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], [[MUL]] // CHECK13-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i8 -// CHECK13-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal !7, !llvm.access.group [[ACC_GRP13]] +// CHECK13-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal [[META7]], !llvm.access.group [[ACC_GRP13]] // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: // CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -5239,7 +5211,7 @@ int fint(void) { return ftemplate(); } // CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP3]], 7 // CHECK15-NEXT: [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]] // CHECK15-NEXT: store i32 [[SUB]], ptr [[I]], align 4 -// CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nontemporal !8 +// CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !nontemporal [[META8:![0-9]+]] // CHECK15-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4 // CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 [[TMP5]] // CHECK15-NEXT: [[TMP6:%.*]] = load 
float, ptr [[ARRAYIDX]], align 4 @@ -5253,7 +5225,7 @@ int fint(void) { return ftemplate(); } // CHECK15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP11]] // CHECK15-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK15-NEXT: [[MUL4:%.*]] = fmul float [[MUL2]], [[TMP12]] -// CHECK15-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 4, !nontemporal !8 +// CHECK15-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 4, !nontemporal [[META8]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP14]] // CHECK15-NEXT: store float [[MUL4]], ptr [[ARRAYIDX5]], align 4 @@ -5389,7 +5361,7 @@ int fint(void) { return ftemplate(); } // CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1 // CHECK15-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], [[MUL]] // CHECK15-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i8 -// CHECK15-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal !8, !llvm.access.group [[ACC_GRP14]] +// CHECK15-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal [[META8]], !llvm.access.group [[ACC_GRP14]] // CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK15: omp.body.continue: // CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -6870,7 +6842,7 @@ int fint(void) { return ftemplate(); } // CHECK21-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 7 // CHECK21-NEXT: [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]] // CHECK21-NEXT: store i32 [[SUB]], ptr [[I]], align 4 -// CHECK21-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP1]], align 8, !nontemporal !16 +// CHECK21-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP1]], align 8, !nontemporal [[META16:![0-9]+]] // CHECK21-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK21-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64 // CHECK21-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM]] @@ -6887,7 +6859,7 @@ int 
fint(void) { return ftemplate(); } // CHECK21-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[IDXPROM5]] // CHECK21-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 // CHECK21-NEXT: [[MUL7:%.*]] = fmul float [[MUL4]], [[TMP20]] -// CHECK21-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP0]], align 8, !nontemporal !16 +// CHECK21-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP0]], align 8, !nontemporal [[META16]] // CHECK21-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4 // CHECK21-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP22]] to i64 // CHECK21-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[IDXPROM8]] @@ -7142,7 +7114,7 @@ int fint(void) { return ftemplate(); } // CHECK21-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 // CHECK21-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], [[MUL]] // CHECK21-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i8 -// CHECK21-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal !16, !llvm.access.group [[ACC_GRP22]] +// CHECK21-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal [[META16]], !llvm.access.group [[ACC_GRP22]] // CHECK21-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK21: omp.body.continue: // CHECK21-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -7497,7 +7469,7 @@ int fint(void) { return ftemplate(); } // CHECK23-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 7 // CHECK23-NEXT: [[SUB:%.*]] = sub nsw i32 32000000, [[MUL]] // CHECK23-NEXT: store i32 [[SUB]], ptr [[I]], align 4 -// CHECK23-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP1]], align 4, !nontemporal !17 +// CHECK23-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP1]], align 4, !nontemporal [[META17:![0-9]+]] // CHECK23-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK23-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 [[TMP13]] // CHECK23-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 @@ -7511,7 +7483,7 @@ int fint(void) { return ftemplate(); } // CHECK23-NEXT: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 [[TMP19]] // CHECK23-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK23-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP20]] -// CHECK23-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP0]], align 4, !nontemporal !17 +// CHECK23-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP0]], align 4, !nontemporal [[META17]] // CHECK23-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4 // CHECK23-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 [[TMP22]] // CHECK23-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4 @@ -7761,7 +7733,7 @@ int fint(void) { return ftemplate(); } // CHECK23-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 // CHECK23-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], [[MUL]] // CHECK23-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i8 -// CHECK23-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal !17, !llvm.access.group [[ACC_GRP23]] +// CHECK23-NEXT: store i8 [[CONV11]], ptr [[I6]], align 1, !nontemporal [[META17]], !llvm.access.group [[ACC_GRP23]] // CHECK23-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK23: omp.body.continue: // CHECK23-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] diff --git a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp index c9083e3749232..e388393c37f18 100644 --- a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp @@ -314,13 +314,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -493,13 +486,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -854,7 +840,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1215,13 +1201,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1538,7 +1517,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1897,13 +1876,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -1983,11 +1955,11 @@ int main() { // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = 
getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done7: -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP14]] // @@ -2016,7 +1988,7 @@ int main() { // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR1]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2089,11 +2061,11 @@ int main() { // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label 
[[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done7: -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP14]] // @@ -2104,7 +2076,7 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // @@ -2171,7 +2143,7 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // @@ -2286,11 +2258,11 @@ int main() { // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4:[0-9]+]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3:[0-9]+]] // CHECK15-NEXT: 
[[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done6: -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP14]] // @@ -2319,7 +2291,7 @@ int main() { // // // CHECK15-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK15-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK15-SAME: () #[[ATTR1]] comdat { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK15-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2390,11 +2362,11 @@ int main() { // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_INNER_FOR_END]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done6: -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[RETVAL]], align 4 // 
CHECK15-NEXT: ret i32 [[TMP14]] // @@ -2405,7 +2377,7 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // @@ -2472,7 +2444,7 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // diff --git a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp index 84806e722ebd5..5720c3aaaaff5 100644 --- a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp @@ -303,13 +303,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -479,13 +472,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define 
{{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -859,7 +845,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1238,13 +1224,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1580,7 +1559,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1957,13 +1936,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -2077,14 +2049,14 @@ int main() { // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i64 4, i1 false) // CHECK13-NEXT: [[TMP17:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK13-NEXT: store i32 [[TMP17]], ptr @_ZZ4mainE4svar, align 4 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) 
[[VAR7]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: [[ARRAY_BEGIN14:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR6]], i32 0, i32 0 // CHECK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN14]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_ARRAYCPY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN14]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done15: @@ -2096,11 +2068,11 @@ int main() { // CHECK13: arraydestroy.body18: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST19:%.*]] = phi ptr [ [[TMP19]], [[ARRAYDESTROY_DONE15]] ], [ [[ARRAYDESTROY_ELEMENT20:%.*]], [[ARRAYDESTROY_BODY18]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT20]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST19]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT20]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT20]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE21:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT20]], [[ARRAY_BEGIN17]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE21]], label [[ARRAYDESTROY_DONE22:%.*]], label [[ARRAYDESTROY_BODY18]] // CHECK13: arraydestroy.done22: -// CHECK13-NEXT: call void 
@_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP20:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP20]] // @@ -2134,12 +2106,12 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR1]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2243,14 +2215,14 @@ int main() { // CHECK13: omp.arraycpy.done13: // CHECK13-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP8]], align 8 // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i64 4, i1 false) -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAY_BEGIN14:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR6]], i32 0, i32 0 // CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN14]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: 
[[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN14]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done15: @@ -2261,11 +2233,11 @@ int main() { // CHECK13: arraydestroy.body17: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE15]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAY_BEGIN16]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17]] // CHECK13: arraydestroy.done21: -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP19]] // @@ -2333,7 +2305,7 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr 
[[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // @@ -2482,14 +2454,14 @@ int main() { // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i32 4, i1 false) // CHECK15-NEXT: [[TMP17:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK15-NEXT: store i32 [[TMP17]], ptr @_ZZ4mainE4svar, align 4 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR4:[0-9]+]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR3:[0-9]+]] // CHECK15-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR6]], i32 0, i32 0 // CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN13]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_ARRAYCPY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done14: @@ -2501,11 +2473,11 @@ int main() { // CHECK15: arraydestroy.body17: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ 
[[TMP19]], [[ARRAYDESTROY_DONE14]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAY_BEGIN16]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17]] // CHECK15: arraydestroy.done21: -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP20:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP20]] // @@ -2539,12 +2511,12 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // // CHECK15-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK15-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK15-SAME: () #[[ATTR1]] comdat { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK15-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2646,14 +2618,14 @@ int main() { // CHECK15: omp.arraycpy.done12: // CHECK15-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP8]], align 4 // CHECK15-NEXT: call void 
@llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i32 4, i1 false) -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR6]], i32 0, i32 0 // CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN13]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done14: @@ -2664,11 +2636,11 @@ int main() { // CHECK15: arraydestroy.body16: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST17:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE14]] ], [ [[ARRAYDESTROY_ELEMENT18:%.*]], [[ARRAYDESTROY_BODY16]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT18]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST17]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR3]] // 
CHECK15-NEXT: [[ARRAYDESTROY_DONE19:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT18]], [[ARRAY_BEGIN15]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_DONE20:%.*]], label [[ARRAYDESTROY_BODY16]] // CHECK15: arraydestroy.done20: -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP19]] // @@ -2736,7 +2708,7 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // diff --git a/clang/test/OpenMP/distribute_simd_private_codegen.cpp b/clang/test/OpenMP/distribute_simd_private_codegen.cpp index 3e00b98518c85..eac5bce7d5da8 100644 --- a/clang/test/OpenMP/distribute_simd_private_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_private_codegen.cpp @@ -226,13 +226,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -344,13 +337,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret 
void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1023,13 +1009,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1672,13 +1651,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp index 30175edf1858c..31d276e984eee 100644 --- a/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp @@ -379,13 +379,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -683,13 +676,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -991,13 +977,6 @@ int 
main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/map_struct_ordering.cpp b/clang/test/OpenMP/map_struct_ordering.cpp index 035b39b5b12ab..dd0bb8e9d6e7c 100644 --- a/clang/test/OpenMP/map_struct_ordering.cpp +++ b/clang/test/OpenMP/map_struct_ordering.cpp @@ -163,10 +163,3 @@ int map_struct() { // CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: ret void // -// -// CHECK-LABEL: define internal void @.omp_offloading.requires_reg( -// CHECK-SAME: ) #[[ATTR5:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp index b3a4ab2e7e9e8..5d7da793e7326 100644 --- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp +++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp @@ -814,13 +814,6 @@ int main(int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27 // CHECK2-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK2-NEXT: entry: @@ -939,7 +932,7 @@ int main(int argc, char **argv) { // CHECK2-NEXT: [[TMP:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8 -// 
CHECK2-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK2-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8 // CHECK2-NEXT: [[_TMP4:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[B5:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[_TMP6:%.*]] = alloca ptr, align 8 @@ -975,17 +968,17 @@ int main(int argc, char **argv) { // CHECK2-NEXT: store i32 [[TMP9]], ptr [[C7]], align 4 // CHECK2-NEXT: store ptr [[C7]], ptr [[_TMP8]], align 8 // CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP4]], align 8 -// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP10]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 0 // CHECK2-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP11]], align 8 -// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP10]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 1 // CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 // CHECK2-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP10]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 2 // CHECK2-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP8]], align 8 // CHECK2-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8 -// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP10]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 3 // CHECK2-NEXT: store ptr [[D_ADDR]], ptr [[TMP16]], align 8 -// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP10]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 4 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[TMP17]], align 8 // CHECK2-NEXT: 
[[TMP18:%.*]] = load ptr, ptr [[_TMP4]], align 8 // CHECK2-NEXT: [[CALL:%.*]] = call noundef i64 @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(40) [[TMP18]]) #[[ATTR7]] @@ -1066,7 +1059,7 @@ int main(int argc, char **argv) { // CHECK2-NEXT: [[TMP:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8 -// CHECK2-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK2-NEXT: [[L3:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8 // CHECK2-NEXT: [[_TMP4:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[ARGC5:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[B6:%.*]] = alloca i32, align 4 @@ -1106,17 +1099,17 @@ int main(int argc, char **argv) { // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK2-NEXT: store i32 [[TMP11]], ptr [[A10]], align 4 // CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[_TMP4]], align 8 -// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP12]], i32 0, i32 0 +// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 0 // CHECK2-NEXT: store ptr [[ARGC5]], ptr [[TMP13]], align 8 -// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP12]], i32 0, i32 1 +// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 1 // CHECK2-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8 // CHECK2-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8 -// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP12]], i32 0, i32 2 +// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 2 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP9]], align 8 // CHECK2-NEXT: store ptr [[TMP17]], ptr [[TMP16]], align 8 -// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP12]], i32 0, i32 3 +// CHECK2-NEXT: [[TMP18:%.*]] = 
getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 3 // CHECK2-NEXT: store ptr [[D_ADDR]], ptr [[TMP18]], align 8 -// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP12]], i32 0, i32 4 +// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 4 // CHECK2-NEXT: store ptr [[A10]], ptr [[TMP19]], align 8 // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[_TMP4]], align 8 // CHECK2-NEXT: [[CALL:%.*]] = call noundef i64 @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(40) [[TMP20]]) #[[ATTR7]] @@ -1376,7 +1369,7 @@ int main(int argc, char **argv) { // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// CHECK3-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK3-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8 // CHECK3-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 // CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1392,7 +1385,7 @@ int main(int argc, char **argv) { // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP3]], i64 8, i1 false) // CHECK3-NEXT: store ptr [[L1]], ptr [[_TMP2]], align 8 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP2]], align 8 -// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP4]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP5]], align 8 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP2]], align 8 // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP6]]) #[[ATTR7]] @@ -1408,7 +1401,7 @@ int main(int argc, char **argv) { // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // 
CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_0:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_1:%.*]], ptr [[THIS1]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP1]], i32 0, i32 0 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4 @@ -1454,7 +1447,7 @@ int main(int argc, char **argv) { // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// CHECK3-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK3-NEXT: [[L1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8 // CHECK3-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 @@ -1467,7 +1460,7 @@ int main(int argc, char **argv) { // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP2]], i64 8, i1 false) // CHECK3-NEXT: store ptr [[L1]], ptr [[_TMP2]], align 8 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP2]], align 8 -// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[TMP3]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP3]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP4]], align 8 // CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP2]], align 8 // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP5]]) #[[ATTR7]] @@ -1507,7 +1500,7 @@ int main(int argc, char **argv) { // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // 
CHECK3-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// CHECK3-NEXT: [[T1:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8 +// CHECK3-NEXT: [[T1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8 // CHECK3-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 diff --git a/clang/test/OpenMP/openmp_offload_registration.cpp b/clang/test/OpenMP/openmp_offload_registration.cpp index 4d2a15d3a258f..aff8d431650dc 100644 --- a/clang/test/OpenMP/openmp_offload_registration.cpp +++ b/clang/test/OpenMP/openmp_offload_registration.cpp @@ -10,15 +10,6 @@ void foo(void) { // CHECK-DAG: [[ENTTY:%.+]] = type { ptr, ptr, i[[SZ:32|64]], i32, i32 } -// Check target registration is registered as a Ctor. -// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - // Check presence of foo() and the outlined target region // CHECK: define{{.*}} void [[FOO:@.+]]() // CHECK: define internal void [[OUTLINEDTARGET:@.+]]() - -// Check registration and unregistration code. 
- -// CHECK: define internal void @.omp_offloading.requires_reg() -// CHECK: call void @__tgt_register_requires(i64 1) -// CHECK: ret void diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp index 262998e149ed2..d47c6ec7214df 100644 --- a/clang/test/OpenMP/reduction_implicit_map.cpp +++ b/clang/test/OpenMP/reduction_implicit_map.cpp @@ -347,7 +347,7 @@ int main() // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l50(ptr [[O]]) #[[ATTR7:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l50(ptr [[O]]) #[[ATTR6:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -388,14 +388,14 @@ int main() // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED5:%.*]], label [[OMP_OFFLOAD_CONT6:%.*]] // CHECK1: omp_offload.failed5: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l55(ptr [[B]]) #[[ATTR7]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l55(ptr [[B]]) #[[ATTR6]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT6]] // CHECK1: omp_offload.cont6: // CHECK1-NEXT: ret i32 0 // // // CHECK1-LABEL: define {{[^@]+}}@_ZN2S2C1Ev -// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat { +// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -405,7 +405,7 @@ int main() // 
// // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l50 -// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(20) [[O:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(20) [[O:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8 @@ -415,7 +415,7 @@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l50.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(20) [[O:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(20) [[O:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -475,7 +475,7 @@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l50.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -493,7 +493,7 @@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l55 -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(8000) [[B:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(8000) [[B:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 @@ -503,7 +503,7 
@@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l55.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8000) [[B:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8000) [[B:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -696,7 +696,7 @@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@.red_init. -// CHECK1-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -719,7 +719,7 @@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@.red_comb. 
-// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -748,7 +748,7 @@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3barv_l55.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -782,7 +782,7 @@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@_ZN2S2C2Ev -// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat { +// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -793,7 +793,7 @@ int main() // // // CHECK1-LABEL: define {{[^@]+}}@main -// CHECK1-SAME: () #[[ATTR10:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR9:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -803,13 +803,6 @@ int main() // CHECK1-NEXT: ret i32 0 // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR11:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@_Z3sumPiiS_ // CHECK2-SAME: (ptr noundef [[INPUT:%.*]], i32 noundef [[SIZE:%.*]], ptr noundef [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK2-NEXT: entry: @@ -2100,17 +2093,10 @@ int main() 
// CHECK2-NEXT: [[RESULT:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK2-NEXT: store i32 100, ptr [[SIZE]], align 4 -// CHECK2-NEXT: [[CALL:%.*]] = call noalias noundef nonnull ptr @_Znaj(i32 noundef 400) #[[ATTR9:[0-9]+]] +// CHECK2-NEXT: [[CALL:%.*]] = call noalias noundef nonnull ptr @_Znaj(i32 noundef 400) #[[ATTR8:[0-9]+]] // CHECK2-NEXT: store ptr [[CALL]], ptr [[ARRAY]], align 4 // CHECK2-NEXT: store i32 0, ptr [[RESULT]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY]], align 4 // CHECK2-NEXT: call void @_Z3sumPiiS_(ptr noundef [[TMP0]], i32 noundef 100, ptr noundef [[RESULT]]) // CHECK2-NEXT: ret i32 0 // -// -// CHECK2-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK2-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK2-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_codegen.cpp b/clang/test/OpenMP/target_codegen.cpp index fa96d5f00920b..34a6c07852656 100644 --- a/clang/test/OpenMP/target_codegen.cpp +++ b/clang/test/OpenMP/target_codegen.cpp @@ -114,10 +114,6 @@ // TCHECK: @{{.+}} = weak constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; @@ -890,10 +886,6 @@ void thread_limit_target(int TargetTL, int TeamsTL) { // OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0, -// CHECK: define internal void @.omp_offloading.requires_reg() -// CHECK: call void @__tgt_register_requires(i64 1) -// CHECK: ret void - int main () { S2 bar; bar.zee(); diff --git a/clang/test/OpenMP/target_codegen_global_capture.cpp b/clang/test/OpenMP/target_codegen_global_capture.cpp index 4237914c4551a..0fb52c3fa6c55 100644 --- a/clang/test/OpenMP/target_codegen_global_capture.cpp +++ b/clang/test/OpenMP/target_codegen_global_capture.cpp @@ -1074,13 +1074,6 @@ int tbar2(short a, short b, short c, short d){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3foossss // CHECK3-SAME: (i16 noundef signext [[A:%.*]], i16 noundef signext [[B:%.*]], i16 noundef signext [[C:%.*]], i16 noundef signext [[D:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1988,10 +1981,3 @@ int tbar2(short a, short b, short c, short d){ // CHECK3-NEXT: store float [[CONV5]], ptr [[TMP2]], align 4 // CHECK3-NEXT: ret void // -// -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_codegen_registration.cpp b/clang/test/OpenMP/target_codegen_registration.cpp index 1041c8f5dfec7..5313da30c4ecf 100644 --- a/clang/test/OpenMP/target_codegen_registration.cpp +++ b/clang/test/OpenMP/target_codegen_registration.cpp @@ -169,11 +169,10 @@ // TCHECK-DAG: 
@.omp_offloading.entry.[[NAME12]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr [[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. -// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null } +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ @@ -343,7 +342,6 @@ struct ST { //TCHECK-DAG: define weak{{.*}} void @[[NAME12]]( // CHECK-NTARGET-NOT: __tgt_target -// CHECK-NTARGET-NOT: __tgt_register_requires // TCHECK-NOT: __tgt_target @@ -403,31 +401,31 @@ int bar(int a){ // Check metadata is properly generated: // CHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 205, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 255, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 271, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 277, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 288, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = 
!{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 294, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 398, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 300, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 294, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 300, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 288, i32 0, i32 {{[0-9]+}}} -// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 254, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 270, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 276, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 287, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 293, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 396, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 299, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 293, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 299, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 287, i32 0, i32 {{[0-9]+}}} +// CHECK-DAG: = !{i32 0, i32 
[[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 229, i32 0, i32 {{[0-9]+}}} // TCHECK: !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 205, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 255, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 271, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 277, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 288, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 294, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 398, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 300, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 294, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 300, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 288, i32 0, i32 {{[0-9]+}}} -// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 230, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 204, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 254, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 270, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], 
!"_ZN2SED1Ev", i32 276, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 287, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 293, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 396, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 299, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 293, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 299, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 287, i32 0, i32 {{[0-9]+}}} +// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 229, i32 0, i32 {{[0-9]+}}} #endif diff --git a/clang/test/OpenMP/target_depend_codegen.cpp b/clang/test/OpenMP/target_depend_codegen.cpp index 83430dd97b073..65bd7279a10b1 100644 --- a/clang/test/OpenMP/target_depend_codegen.cpp +++ b/clang/test/OpenMP/target_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; @@ -267,8 +263,4 @@ int foo(int n) { // CHECK: call void [[HVT2]](i[[SZ]] [[BP1]]) // CHECK: ret i32 0 -// CHECK: define internal void @.omp_offloading.requires_reg() -// CHECK: call void @__tgt_register_requires(i64 1) -// CHECK: ret void - #endif diff --git a/clang/test/OpenMP/target_firstprivate_codegen.cpp b/clang/test/OpenMP/target_firstprivate_codegen.cpp index bd9874fcae6e3..6314940730470 100644 --- a/clang/test/OpenMP/target_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_firstprivate_codegen.cpp @@ -6849,13 +6849,6 @@ int bar(int n, double *ptr) { // CHECK0-NEXT: ret void // // -// CHECK0-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK0-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK0-NEXT: entry: -// CHECK0-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK0-NEXT: ret void -// -// // CHECK1-LABEL: define {{[^@]+}}@_Z3fooiPd // CHECK1-SAME: (i32 noundef signext [[N:%.*]], ptr noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK1-NEXT: entry: @@ -7627,13 +7620,6 @@ int bar(int n, double *ptr) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@_Z3fooiPd // CHECK2-SAME: (i32 noundef [[N:%.*]], ptr noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK2-NEXT: entry: @@ -8405,13 +8391,6 @@ int bar(int n, double *ptr) { // CHECK2-NEXT: ret void // // -// CHECK2-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK2-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK2-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3fooiPd // CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef 
[[PTR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -9183,13 +9162,6 @@ int bar(int n, double *ptr) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // SIMD-ONLY0-LABEL: define {{[^@]+}}@_Z3fooiPd // SIMD-ONLY0-SAME: (i32 noundef signext [[N:%.*]], ptr noundef [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { // SIMD-ONLY0-NEXT: entry: diff --git a/clang/test/OpenMP/target_has_device_addr_codegen.cpp b/clang/test/OpenMP/target_has_device_addr_codegen.cpp index 05b4876e8f90f..e6a0e7bb38d64 100644 --- a/clang/test/OpenMP/target_has_device_addr_codegen.cpp +++ b/clang/test/OpenMP/target_has_device_addr_codegen.cpp @@ -661,7 +661,7 @@ void use_template() { // // // CHECK-LABEL: define {{[^@]+}}@_Z5tmainIiET_S0_ -// CHECK-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR1]] comdat { // CHECK-NEXT: entry: // CHECK-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[DA:%.*]] = alloca [5 x i32], align 4 @@ -867,7 +867,7 @@ void use_template() { // // // CHECK-LABEL: define {{[^@]+}}@_Z5tmainIPiET_S1_ -// CHECK-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR6]] comdat { +// CHECK-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR1]] comdat { // CHECK-NEXT: entry: // CHECK-NEXT: [[ARGC_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DA:%.*]] = alloca [5 x ptr], align 8 @@ -1181,7 +1181,7 @@ void use_template() { // // // CHECK-LABEL: define {{[^@]+}}@_Z12use_templatev -// CHECK-SAME: () #[[ATTR6]] { +// CHECK-SAME: () #[[ATTR1]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[AKERN:%.*]] = alloca [[STRUCT_SOMEKERNEL:%.*]], align 4 // CHECK-NEXT: call void @_ZN10SomeKernelC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AKERN]]) @@ -1191,7 +1191,7 @@ void use_template() { // // // CHECK-LABEL: define 
{{[^@]+}}@_ZN10SomeKernel5applyILj32EEEvv -// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat { +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR1]] comdat { // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 @@ -1322,13 +1322,6 @@ void use_template() { // CHECK-NEXT: ret ptr [[TMP1]] // // -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// -// // SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init // SIMD-ONLY0-SAME: () #[[ATTR0:[0-9]+]] { // SIMD-ONLY0-NEXT: entry: @@ -1463,7 +1456,7 @@ void use_template() { // // // SIMD-ONLY0-LABEL: define {{[^@]+}}@_Z5tmainIiET_S0_ -// SIMD-ONLY0-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR4:[0-9]+]] comdat { +// SIMD-ONLY0-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY0-NEXT: entry: // SIMD-ONLY0-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // SIMD-ONLY0-NEXT: [[DA:%.*]] = alloca [5 x i32], align 4 @@ -1505,7 +1498,7 @@ void use_template() { // // // SIMD-ONLY0-LABEL: define {{[^@]+}}@_Z5tmainIPiET_S1_ -// SIMD-ONLY0-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR4]] comdat { +// SIMD-ONLY0-SAME: (ptr noundef [[ARGC:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY0-NEXT: entry: // SIMD-ONLY0-NEXT: [[ARGC_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY0-NEXT: [[DA:%.*]] = alloca [5 x ptr], align 8 @@ -1547,17 +1540,17 @@ void use_template() { // // // SIMD-ONLY0-LABEL: define {{[^@]+}}@_Z12use_templatev -// SIMD-ONLY0-SAME: () #[[ATTR4]] { +// SIMD-ONLY0-SAME: () #[[ATTR1]] { // SIMD-ONLY0-NEXT: entry: // SIMD-ONLY0-NEXT: [[AKERN:%.*]] = alloca [[STRUCT_SOMEKERNEL:%.*]], align 4 // SIMD-ONLY0-NEXT: call void @_ZN10SomeKernelC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AKERN]]) // 
SIMD-ONLY0-NEXT: call void @_ZN10SomeKernel5applyILj32EEEvv(ptr noundef nonnull align 4 dereferenceable(8) [[AKERN]]) -// SIMD-ONLY0-NEXT: call void @_ZN10SomeKernelD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AKERN]]) #[[ATTR7:[0-9]+]] +// SIMD-ONLY0-NEXT: call void @_ZN10SomeKernelD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AKERN]]) #[[ATTR6:[0-9]+]] // SIMD-ONLY0-NEXT: ret void // // // SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN10SomeKernel5applyILj32EEEvv -// SIMD-ONLY0-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR4]] comdat { +// SIMD-ONLY0-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY0-NEXT: entry: // SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY0-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 diff --git a/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp b/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp index 3fc22111a8b41..efd0a1ee88fe2 100644 --- a/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp +++ b/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp @@ -131,7 +131,7 @@ int main() { // CHECK-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 // CHECK-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK: omp_offload.failed: -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27(ptr [[A]], ptr [[TMP4]], ptr [[TMP5]], ptr [[ARR]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27(ptr [[A]], ptr [[TMP4]], ptr [[TMP5]], ptr [[ARR]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] // CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK: omp_offload.cont: // CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[A]], align 4 @@ -154,7 +154,7 @@ int main() { // // // CHECK-LABEL: define {{[^@]+}}@_ZN1S3fooEv -// CHECK-SAME: (ptr noundef nonnull align 8 dereferenceable(40) [[THIS:%.*]]) 
#[[ATTR3:[0-9]+]] comdat { +// CHECK-SAME: (ptr noundef nonnull align 8 dereferenceable(40) [[THIS:%.*]]) #[[ATTR2]] comdat { // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x ptr], align 8 @@ -240,14 +240,14 @@ int main() { // CHECK-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK: omp_offload.failed: -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l14(ptr [[THIS1]]) #[[ATTR5]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l14(ptr [[THIS1]]) #[[ATTR4]] // CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK: omp_offload.cont: // CHECK-NEXT: ret void // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27 -// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef [[PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[REF:%.*]], ptr noundef nonnull align 4 dereferenceable(16) [[ARR:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VLA1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef [[PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[REF:%.*]], ptr noundef nonnull align 4 dereferenceable(16) [[ARR:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VLA1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 @@ -308,7 +308,7 @@ int main() { // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l14 -// CHECK-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR4]] { +// CHECK-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR3]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 
// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -335,13 +335,6 @@ int main() { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// -// // SIMD-ONLY0-LABEL: define {{[^@]+}}@main // SIMD-ONLY0-SAME: () #[[ATTR0:[0-9]+]] { // SIMD-ONLY0-NEXT: entry: @@ -409,7 +402,7 @@ int main() { // // // SIMD-ONLY0-LABEL: define {{[^@]+}}@_ZN1S3fooEv -// SIMD-ONLY0-SAME: (ptr noundef nonnull align 8 dereferenceable(40) [[THIS:%.*]]) #[[ATTR3:[0-9]+]] comdat { +// SIMD-ONLY0-SAME: (ptr noundef nonnull align 8 dereferenceable(40) [[THIS:%.*]]) #[[ATTR2]] comdat { // SIMD-ONLY0-NEXT: entry: // SIMD-ONLY0-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY0-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 diff --git a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp index 6a5d5608e0fa8..162f18529a489 100644 --- a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp @@ -2233,13 +2233,6 @@ void bar() { // CK10-NEXT: ret void // // -// CK10-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK10-SAME: () #[[ATTR3:[0-9]+]] { -// CK10-NEXT: entry: -// CK10-NEXT: call void @__tgt_register_requires(i64 1) -// CK10-NEXT: ret void -// -// // CK11-LABEL: define {{[^@]+}}@_Z3barRPfRPi // CK11-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CK11-NEXT: entry: @@ -2711,13 +2704,6 @@ void bar() { // CK11-NEXT: ret void // // -// CK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK11-SAME: () #[[ATTR3:[0-9]+]] { -// CK11-NEXT: entry: -// CK11-NEXT: call void @__tgt_register_requires(i64 1) -// CK11-NEXT: ret void -// -// // CK12-LABEL: define {{[^@]+}}@_Z3barRPfRPi 
// CK12-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CK12-NEXT: entry: @@ -3189,13 +3175,6 @@ void bar() { // CK12-NEXT: ret void // // -// CK12-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK12-SAME: () #[[ATTR3:[0-9]+]] { -// CK12-NEXT: entry: -// CK12-NEXT: call void @__tgt_register_requires(i64 1) -// CK12-NEXT: ret void -// -// // CK13-LABEL: define {{[^@]+}}@_Z3barRPfRPi // CK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CK13-NEXT: entry: @@ -3667,13 +3646,6 @@ void bar() { // CK13-NEXT: ret void // // -// CK13-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK13-SAME: () #[[ATTR3:[0-9]+]] { -// CK13-NEXT: entry: -// CK13-NEXT: call void @__tgt_register_requires(i64 1) -// CK13-NEXT: ret void -// -// // SIMD-ONLY00-LABEL: define {{[^@]+}}@_Z3barRPfRPi // SIMD-ONLY00-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) #[[ATTR0:[0-9]+]] { // SIMD-ONLY00-NEXT: entry: @@ -3985,7 +3957,7 @@ void bar() { // // // CK20-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd -// CK20-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat { +// CK20-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CK20-NEXT: entry: // CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 @@ -4060,7 +4032,7 @@ void bar() { // CK20-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CK20-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CK20: omp_offload.failed: -// CK20-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK20-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CK20-NEXT: br label [[OMP_OFFLOAD_CONT]] // CK20: omp_offload.cont: // CK20-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 @@ -4118,7 +4090,7 @@ void bar() { // CK20-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CK20-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CK20: omp_offload.failed6: -// CK20-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK20-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR2]] // CK20-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CK20: omp_offload.cont7: // CK20-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 @@ -4183,14 +4155,14 @@ void bar() { // CK20-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CK20-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] // CK20: omp_offload.failed15: -// CK20-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK20-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR2]] // CK20-NEXT: br label [[OMP_OFFLOAD_CONT16]] // CK20: omp_offload.cont16: // CK20-NEXT: ret void // // // CK20-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd -// CK20-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat { +// CK20-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CK20-NEXT: entry: // CK20-NEXT: [[THIS_ADDR:%.*]] = 
alloca ptr, align 8 // CK20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 @@ -4206,7 +4178,7 @@ void bar() { // // // CK20-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 -// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] { // CK20-NEXT: entry: // CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4219,7 +4191,7 @@ void bar() { // // // CK20-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 -// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CK20-NEXT: entry: // CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4233,7 +4205,7 @@ void bar() { // // // CK20-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 -// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK20-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CK20-NEXT: entry: // CK20-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK20-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4250,13 +4222,6 @@ void bar() { // CK20-NEXT: ret void // // -// CK20-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK20-SAME: () #[[ATTR5:[0-9]+]] { -// CK20-NEXT: entry: -// CK20-NEXT: call void @__tgt_register_requires(i64 1) -// CK20-NEXT: ret void -// -// // CK21-LABEL: define {{[^@]+}}@_Z3barPd // CK21-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { // CK21-NEXT: entry: @@ -4272,7 +4237,7 @@ void bar() { // // // CK21-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd -// CK21-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat { +// CK21-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull 
align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CK21-NEXT: entry: // CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK21-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 @@ -4347,7 +4312,7 @@ void bar() { // CK21-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CK21-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CK21: omp_offload.failed: -// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CK21-NEXT: br label [[OMP_OFFLOAD_CONT]] // CK21: omp_offload.cont: // CK21-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 @@ -4405,7 +4370,7 @@ void bar() { // CK21-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CK21-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CK21: omp_offload.failed6: -// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR2]] // CK21-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CK21: omp_offload.cont7: // CK21-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 @@ -4470,14 +4435,14 @@ void bar() { // CK21-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CK21-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] // CK21: omp_offload.failed15: -// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK21-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR2]] // CK21-NEXT: br label [[OMP_OFFLOAD_CONT16]] // CK21: omp_offload.cont16: // CK21-NEXT: ret void // // // 
CK21-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd -// CK21-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat { +// CK21-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CK21-NEXT: entry: // CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK21-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 @@ -4493,7 +4458,7 @@ void bar() { // // // CK21-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 -// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] { // CK21-NEXT: entry: // CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4506,7 +4471,7 @@ void bar() { // // // CK21-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 -// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CK21-NEXT: entry: // CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4520,7 +4485,7 @@ void bar() { // // // CK21-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 -// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK21-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CK21-NEXT: entry: // CK21-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CK21-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -4537,13 +4502,6 @@ void bar() { // CK21-NEXT: ret void // // -// CK21-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK21-SAME: () #[[ATTR5:[0-9]+]] { -// CK21-NEXT: entry: -// CK21-NEXT: call void @__tgt_register_requires(i64 1) -// CK21-NEXT: ret void -// -// // CK22-LABEL: define {{[^@]+}}@_Z3barPd // 
CK22-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { // CK22-NEXT: entry: @@ -4559,7 +4517,7 @@ void bar() { // // // CK22-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd -// CK22-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CK22-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // CK22-NEXT: entry: // CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK22-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 @@ -4634,7 +4592,7 @@ void bar() { // CK22-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CK22-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CK22: omp_offload.failed: -// CK22-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK22-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CK22-NEXT: br label [[OMP_OFFLOAD_CONT]] // CK22: omp_offload.cont: // CK22-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 @@ -4692,7 +4650,7 @@ void bar() { // CK22-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CK22-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CK22: omp_offload.failed6: -// CK22-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK22-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR2]] // CK22-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CK22: omp_offload.cont7: // CK22-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 @@ -4757,14 +4715,14 @@ void bar() { // CK22-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CK22-NEXT: 
br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] // CK22: omp_offload.failed15: -// CK22-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK22-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR2]] // CK22-NEXT: br label [[OMP_OFFLOAD_CONT16]] // CK22: omp_offload.cont16: // CK22-NEXT: ret void // // // CK22-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd -// CK22-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CK22-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // CK22-NEXT: entry: // CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK22-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 @@ -4780,7 +4738,7 @@ void bar() { // // // CK22-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 -// CK22-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK22-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] { // CK22-NEXT: entry: // CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 @@ -4793,7 +4751,7 @@ void bar() { // // // CK22-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 -// CK22-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK22-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CK22-NEXT: entry: // CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 @@ -4807,7 +4765,7 @@ void bar() { // // // CK22-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 -// CK22-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK22-SAME: (ptr noundef [[THIS:%.*]]) 
#[[ATTR1]] { // CK22-NEXT: entry: // CK22-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK22-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 @@ -4824,13 +4782,6 @@ void bar() { // CK22-NEXT: ret void // // -// CK22-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK22-SAME: () #[[ATTR5:[0-9]+]] { -// CK22-NEXT: entry: -// CK22-NEXT: call void @__tgt_register_requires(i64 1) -// CK22-NEXT: ret void -// -// // CK23-LABEL: define {{[^@]+}}@_Z3barPd // CK23-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { // CK23-NEXT: entry: @@ -4846,7 +4797,7 @@ void bar() { // // // CK23-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd -// CK23-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CK23-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // CK23-NEXT: entry: // CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK23-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 @@ -4921,7 +4872,7 @@ void bar() { // CK23-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CK23-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CK23: omp_offload.failed: -// CK23-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CK23-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CK23-NEXT: br label [[OMP_OFFLOAD_CONT]] // CK23: omp_offload.cont: // CK23-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 @@ -4979,7 +4930,7 @@ void bar() { // CK23-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CK23-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CK23: omp_offload.failed6: -// CK23-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR3]] +// CK23-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118(ptr [[THIS1]]) #[[ATTR2]] // CK23-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CK23: omp_offload.cont7: // CK23-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 @@ -5044,14 +4995,14 @@ void bar() { // CK23-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CK23-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED15:%.*]], label [[OMP_OFFLOAD_CONT16:%.*]] // CK23: omp_offload.failed15: -// CK23-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR3]] +// CK23-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125(ptr [[THIS1]]) #[[ATTR2]] // CK23-NEXT: br label [[OMP_OFFLOAD_CONT16]] // CK23: omp_offload.cont16: // CK23-NEXT: ret void // // // CK23-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd -// CK23-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CK23-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // CK23-NEXT: entry: // CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK23-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 @@ -5067,7 +5018,7 @@ void bar() { // // // CK23-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l112 -// CK23-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CK23-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] { // CK23-NEXT: entry: // CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 @@ -5080,7 +5031,7 @@ void bar() { // // // CK23-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l118 -// 
CK23-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK23-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CK23-NEXT: entry: // CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 @@ -5094,7 +5045,7 @@ void bar() { // // // CK23-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STIdE3fooERPd_l125 -// CK23-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CK23-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CK23-NEXT: entry: // CK23-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CK23-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 @@ -5111,13 +5062,6 @@ void bar() { // CK23-NEXT: ret void // // -// CK23-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK23-SAME: () #[[ATTR5:[0-9]+]] { -// CK23-NEXT: entry: -// CK23-NEXT: call void @__tgt_register_requires(i64 1) -// CK23-NEXT: ret void -// -// // SIMD-ONLY10-LABEL: define {{[^@]+}}@_Z3barPd // SIMD-ONLY10-SAME: (ptr noundef [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { // SIMD-ONLY10-NEXT: entry: @@ -5133,7 +5077,7 @@ void bar() { // // // SIMD-ONLY10-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd -// SIMD-ONLY10-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat { +// SIMD-ONLY10-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat { // SIMD-ONLY10-NEXT: entry: // SIMD-ONLY10-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY10-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 @@ -5177,7 +5121,7 @@ void bar() { // // // SIMD-ONLY10-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd -// SIMD-ONLY10-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat { +// SIMD-ONLY10-SAME: (ptr noundef nonnull align 8 dereferenceable(16) 
[[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat { // SIMD-ONLY10-NEXT: entry: // SIMD-ONLY10-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY10-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 @@ -5207,7 +5151,7 @@ void bar() { // // // SIMD-ONLY11-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd -// SIMD-ONLY11-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat { +// SIMD-ONLY11-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat { // SIMD-ONLY11-NEXT: entry: // SIMD-ONLY11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY11-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 @@ -5251,7 +5195,7 @@ void bar() { // // // SIMD-ONLY11-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd -// SIMD-ONLY11-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat { +// SIMD-ONLY11-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat { // SIMD-ONLY11-NEXT: entry: // SIMD-ONLY11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY11-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 @@ -5281,7 +5225,7 @@ void bar() { // // // SIMD-ONLY12-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd -// SIMD-ONLY12-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// SIMD-ONLY12-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // SIMD-ONLY12-NEXT: entry: // SIMD-ONLY12-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, 
align 4 // SIMD-ONLY12-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 @@ -5325,7 +5269,7 @@ void bar() { // // // SIMD-ONLY12-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd -// SIMD-ONLY12-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// SIMD-ONLY12-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // SIMD-ONLY12-NEXT: entry: // SIMD-ONLY12-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // SIMD-ONLY12-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 @@ -5355,7 +5299,7 @@ void bar() { // // // SIMD-ONLY13-LABEL: define {{[^@]+}}@_ZN2STIdEC1ERPd -// SIMD-ONLY13-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// SIMD-ONLY13-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // SIMD-ONLY13-NEXT: entry: // SIMD-ONLY13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // SIMD-ONLY13-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 @@ -5399,7 +5343,7 @@ void bar() { // // // SIMD-ONLY13-LABEL: define {{[^@]+}}@_ZN2STIdEC2ERPd -// SIMD-ONLY13-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// SIMD-ONLY13-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // SIMD-ONLY13-NEXT: entry: // SIMD-ONLY13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // SIMD-ONLY13-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4 @@ -5477,13 +5421,6 @@ void bar() { // CK30-NEXT: ret void // // -// CK30-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CK30-SAME: () #[[ATTR3:[0-9]+]] { -// CK30-NEXT: entry: -// CK30-NEXT: call void @__tgt_register_requires(i64 1) -// CK30-NEXT: ret void -// -// // CK31-LABEL: define {{[^@]+}}@_Z3barv // CK31-SAME: () #[[ATTR0:[0-9]+]] { // CK31-NEXT: entry: @@ -5547,13 +5484,6 @@ void bar() { // CK31-NEXT: ret void // // -// CK31-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK31-SAME: () #[[ATTR3:[0-9]+]] { -// CK31-NEXT: entry: -// CK31-NEXT: call void @__tgt_register_requires(i64 1) -// CK31-NEXT: ret void -// -// // CK32-LABEL: define {{[^@]+}}@_Z3barv // CK32-SAME: () #[[ATTR0:[0-9]+]] { // CK32-NEXT: entry: @@ -5617,13 +5547,6 @@ void bar() { // CK32-NEXT: ret void // // -// CK32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK32-SAME: () #[[ATTR3:[0-9]+]] { -// CK32-NEXT: entry: -// CK32-NEXT: call void @__tgt_register_requires(i64 1) -// CK32-NEXT: ret void -// -// // CK33-LABEL: define {{[^@]+}}@_Z3barv // CK33-SAME: () #[[ATTR0:[0-9]+]] { // CK33-NEXT: entry: @@ -5687,13 +5610,6 @@ void bar() { // CK33-NEXT: ret void // // -// CK33-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CK33-SAME: () #[[ATTR3:[0-9]+]] { -// CK33-NEXT: entry: -// CK33-NEXT: call void @__tgt_register_requires(i64 1) -// CK33-NEXT: ret void -// -// // SIMD-ONLY20-LABEL: define {{[^@]+}}@_Z3barv // SIMD-ONLY20-SAME: () #[[ATTR0:[0-9]+]] { // SIMD-ONLY20-NEXT: entry: diff --git a/clang/test/OpenMP/target_map_codegen_03.cpp b/clang/test/OpenMP/target_map_codegen_03.cpp index b4fd0864c75c1..cd28e7bf848e0 100644 --- a/clang/test/OpenMP/target_map_codegen_03.cpp +++ b/clang/test/OpenMP/target_map_codegen_03.cpp @@ -156,13 +156,6 @@ void implicit_maps_nested_integer (int a){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // 
CHECK3-LABEL: define {{[^@]+}}@_Z28implicit_maps_nested_integeri // CHECK3-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -261,10 +254,3 @@ void implicit_maps_nested_integer (int a){ // CHECK3-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 // CHECK3-NEXT: ret void // -// -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_map_codegen_hold.cpp b/clang/test/OpenMP/target_map_codegen_hold.cpp index a0732d5608a9a..81306ccb4cf55 100644 --- a/clang/test/OpenMP/target_map_codegen_hold.cpp +++ b/clang/test/OpenMP/target_map_codegen_hold.cpp @@ -462,13 +462,6 @@ void ST::test_present_members() { // CHECK-USE-PPC64LE-NEXT: ret void // // -// CHECK-USE-PPC64LE-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-USE-PPC64LE-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK-USE-PPC64LE-NEXT: entry: -// CHECK-USE-PPC64LE-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-USE-PPC64LE-NEXT: ret void -// -// // CHECK-USE-I386-LABEL: define {{[^@]+}}@_Z20explicit_maps_singlei // CHECK-USE-I386-SAME: (i32 noundef [[II:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-USE-I386-NEXT: entry: @@ -770,13 +763,6 @@ void ST::test_present_members() { // CHECK-USE-I386-NEXT: ret void // // -// CHECK-USE-I386-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-USE-I386-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK-USE-I386-NEXT: entry: -// CHECK-USE-I386-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-USE-I386-NEXT: ret void -// -// // CHECK-NOUSE-PPC64LE-LABEL: define {{[^@]+}}@_Z20explicit_maps_singlei // CHECK-NOUSE-PPC64LE-SAME: (i32 noundef signext [[II:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NOUSE-PPC64LE-NEXT: entry: @@ -1033,13 +1019,6 @@ void ST::test_present_members() { // CHECK-NOUSE-PPC64LE-NEXT: ret void // // -// CHECK-NOUSE-PPC64LE-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK-NOUSE-PPC64LE-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK-NOUSE-PPC64LE-NEXT: entry: -// CHECK-NOUSE-PPC64LE-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NOUSE-PPC64LE-NEXT: ret void -// -// // CHECK-NOUSE-I386-LABEL: define {{[^@]+}}@_Z20explicit_maps_singlei // CHECK-NOUSE-I386-SAME: (i32 noundef [[II:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NOUSE-I386-NEXT: entry: @@ -1295,10 +1274,3 @@ void ST::test_present_members() { // CHECK-NOUSE-I386-NEXT: entry: // CHECK-NOUSE-I386-NEXT: ret void // -// -// CHECK-NOUSE-I386-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-NOUSE-I386-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK-NOUSE-I386-NEXT: entry: -// CHECK-NOUSE-I386-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NOUSE-I386-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_map_deref_array_codegen.cpp b/clang/test/OpenMP/target_map_deref_array_codegen.cpp index 11ff8125a0a99..da4176f20109c 100644 --- a/clang/test/OpenMP/target_map_deref_array_codegen.cpp +++ b/clang/test/OpenMP/target_map_deref_array_codegen.cpp @@ -303,10 +303,3 @@ void foo(int **t1d) // CHECK-NEXT: store i32 4, ptr [[ADD_PTR2]], align 4 // CHECK-NEXT: ret void // -// -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_map_member_expr_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_codegen.cpp index 89a9ee8018aa0..84844cff09b93 100644 --- a/clang/test/OpenMP/target_map_member_expr_codegen.cpp +++ b/clang/test/OpenMP/target_map_member_expr_codegen.cpp @@ -83,7 +83,7 @@ void foo() { // // // CHECK-LABEL: define {{[^@]+}}@_ZN1BC1Eii -// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(12) [[THIS:%.*]], i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { +// CHECK-SAME: (ptr 
noundef nonnull align 4 dereferenceable(12) [[THIS:%.*]], i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 @@ -167,7 +167,7 @@ void foo() { // CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 // CHECK-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK: omp_offload.failed: -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1B3runEv_l25(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1B3runEv_l25(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK: omp_offload.cont: // CHECK-NEXT: ret void @@ -324,7 +324,7 @@ void foo() { // CHECK-NEXT: [[TMP69:%.*]] = icmp ne i32 [[TMP68]], 0 // CHECK-NEXT: br i1 [[TMP69]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK: omp_offload.failed: -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1C3barER10descriptorIfE_l55(i64 [[TMP43]], ptr [[TMP44]]) #[[ATTR3]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1C3barER10descriptorIfE_l55(i64 [[TMP43]], ptr [[TMP44]]) #[[ATTR2]] // CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK: omp_offload.cont: // CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 @@ -408,14 +408,14 @@ void foo() { // CHECK-NEXT: [[TMP116:%.*]] = icmp ne i32 [[TMP115]], 0 // CHECK-NEXT: br i1 [[TMP116]], label [[OMP_OFFLOAD_FAILED23:%.*]], label [[OMP_OFFLOAD_CONT24:%.*]] // CHECK: omp_offload.failed23: -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1C3barER10descriptorIfE_l59(i64 [[TMP75]], ptr [[TMP76]]) #[[ATTR3]] +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1C3barER10descriptorIfE_l59(i64 [[TMP75]], ptr [[TMP76]]) 
#[[ATTR2]] // CHECK-NEXT: br label [[OMP_OFFLOAD_CONT24]] // CHECK: omp_offload.cont24: // CHECK-NEXT: ret void // // // CHECK-LABEL: define {{[^@]+}}@_ZN1BC2Eii -// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(12) [[THIS:%.*]], i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(12) [[THIS:%.*]], i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 @@ -433,7 +433,7 @@ void foo() { // // // CHECK-LABEL: define {{[^@]+}}@_ZN1AC2Eii -// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 @@ -452,7 +452,7 @@ void foo() { // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1B3runEv_l25 -// CHECK-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -468,7 +468,7 @@ void foo() { // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1C3barER10descriptorIfE_l55 -// CHECK-SAME: (i64 noundef [[CSIZE:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[D:%.*]]) #[[ATTR2]] { +// CHECK-SAME: (i64 noundef [[CSIZE:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[D:%.*]]) #[[ATTR1]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CSIZE_ADDR:%.*]] = alloca 
i64, align 8 // CHECK-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 @@ -487,7 +487,7 @@ void foo() { // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1C3barER10descriptorIfE_l55.omp_outlined -// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[CSIZE:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[D:%.*]]) #[[ATTR2]] { +// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[CSIZE:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[D:%.*]]) #[[ATTR1]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -527,7 +527,7 @@ void foo() { // // // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1C3barER10descriptorIfE_l59 -// CHECK-SAME: (i64 noundef [[CSIZE:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[D:%.*]]) #[[ATTR2]] { +// CHECK-SAME: (i64 noundef [[CSIZE:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[D:%.*]]) #[[ATTR1]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CSIZE_ADDR:%.*]] = alloca i64, align 8 // CHECK-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 @@ -561,10 +561,3 @@ void foo() { // CHECK: for.end: // CHECK-NEXT: ret void // -// -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR5:[0-9]+]] section ".text.startup" { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_offload_mandatory_codegen.cpp b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp index 5803a81c4b1a5..04360f1ea03bd 100644 --- a/clang/test/OpenMP/target_offload_mandatory_codegen.cpp +++ b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp @@ -162,10 +162,3 @@ void host_dev(int device) { // MANDATORY: omp_offload.cont: // MANDATORY-NEXT: ret 
void // -// -// MANDATORY-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// MANDATORY-SAME: () #[[ATTR3:[0-9]+]] { -// MANDATORY-NEXT: entry: -// MANDATORY-NEXT: call void @__tgt_register_requires(i64 1) -// MANDATORY-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp b/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp index 220ffb9a740ad..a8b241c17d248 100644 --- a/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp +++ b/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp @@ -944,52 +944,52 @@ int bar(int n){ // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !27 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !27 -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !27 -// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !27 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !27 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !27 -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !27 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !27 -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !27 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META27:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: store ptr 
@.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META27]] // CHECK1-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]] -// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !27 -// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !27 -// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !27 +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META27]] +// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4 -// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !27 +// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK1-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias !27 +// CHECK1-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias [[META27]] // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK1-NEXT: store ptr [[TMP12]], ptr 
[[TMP17]], align 8, !noalias !27 +// CHECK1-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 8, !noalias !27 +// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 8, !noalias !27 +// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK1-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 8, !noalias !27 +// CHECK1-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK1-NEXT: store ptr null, ptr [[TMP21]], align 8, !noalias !27 +// CHECK1-NEXT: store ptr null, ptr [[TMP21]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK1-NEXT: store ptr null, ptr [[TMP22]], align 8, !noalias !27 +// CHECK1-NEXT: store ptr null, ptr [[TMP22]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK1-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias !27 +// CHECK1-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK1-NEXT: store i64 1, ptr [[TMP24]], align 8, !noalias !27 +// 
CHECK1-NEXT: store i64 1, ptr [[TMP24]], align 8, !noalias [[META27]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias !27 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META27]] // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias !27 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META27]] // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK1-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias !27 +// CHECK1-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META27]] // CHECK1-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l108.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK1: omp_offload.failed.i: // CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4 -// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !27 -// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias !27 +// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META27]] +// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META27]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l108(i64 [[TMP31]]) #[[ATTR2]] // CHECK1-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // 
CHECK1: .omp_outlined..exit: @@ -1057,13 +1057,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3bari // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1829,52 +1822,52 @@ int bar(int n){ // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META26:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !28 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META28:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: store 
ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META28]] // CHECK3-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]] -// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !28 -// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !28 -// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !28 +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4 -// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !28 +// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK3-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias !28 +// CHECK3-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK3-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 4, !noalias !28 +// CHECK3-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 4, !noalias 
[[META28]] // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 4, !noalias !28 +// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 4, !noalias !28 +// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK3-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 4, !noalias !28 +// CHECK3-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK3-NEXT: store ptr null, ptr [[TMP21]], align 4, !noalias !28 +// CHECK3-NEXT: store ptr null, ptr [[TMP21]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK3-NEXT: store ptr null, ptr [[TMP22]], align 4, !noalias !28 +// CHECK3-NEXT: store ptr null, ptr [[TMP22]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK3-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias !28 +// CHECK3-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias [[META28]] // CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK3-NEXT: store i64 1, ptr [[TMP24]], align 8, !noalias !28 +// CHECK3-NEXT: store i64 1, ptr [[TMP24]], align 8, !noalias [[META28]] // CHECK3-NEXT: [[TMP25:%.*]] = 
getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias !28 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias !28 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias !28 +// CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META28]] // CHECK3-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l108.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK3-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK3: omp_offload.failed.i: // CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4 -// CHECK3-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !28 -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !28 +// CHECK3-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META28]] +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META28]] // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l108(i32 [[TMP31]]) #[[ATTR2]] // CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK3: .omp_outlined..exit: @@ -1942,13 +1935,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// 
CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l104 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: diff --git a/clang/test/OpenMP/target_parallel_codegen.cpp b/clang/test/OpenMP/target_parallel_codegen.cpp index bb7999b3e55b1..84b7f1ae4ec04 100644 --- a/clang/test/OpenMP/target_parallel_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_codegen.cpp @@ -607,38 +607,38 @@ int bar(int n){ // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !21 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !21 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META21:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: store ptr null, ptr 
[[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK1-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias !21 +// CHECK1-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK1-NEXT: store ptr null, ptr [[TMP10]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr null, ptr [[TMP10]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK1-NEXT: store ptr null, ptr [[TMP13]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr null, ptr [[TMP13]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// 
CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK1-NEXT: store ptr null, ptr [[TMP15]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr null, ptr [[TMP15]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK1-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias !21 +// CHECK1-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK1-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias !21 +// CHECK1-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias !21 +// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias !21 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK1-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias !21 +// CHECK1-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 1, i32 0, ptr 
@.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK1-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 // CHECK1-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] @@ -1387,13 +1387,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3fooi // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1710,38 +1703,38 @@ int bar(int n){ // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META22:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store ptr null, ptr 
[[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK3-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK3-NEXT: store ptr null, ptr [[TMP10]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr null, ptr [[TMP10]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK3-NEXT: store ptr null, ptr [[TMP13]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr null, ptr [[TMP13]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr 
null, ptr [[TMP14]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK3-NEXT: store ptr null, ptr [[TMP15]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr null, ptr [[TMP15]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK3-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias !22 +// CHECK3-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias [[META22]] // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK3-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias !22 +// CHECK3-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias [[META22]] // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK3-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias !22 +// CHECK3-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias !22 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK3-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK3-NEXT: [[TMP22:%.*]] = icmp ne i32 
[[TMP21]], 0 // CHECK3-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] @@ -2490,13 +2483,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: diff --git a/clang/test/OpenMP/target_parallel_codegen_registration.cpp b/clang/test/OpenMP/target_parallel_codegen_registration.cpp index 352f8db60012c..dd3ec98bcfef7 100644 --- a/clang/test/OpenMP/target_parallel_codegen_registration.cpp +++ b/clang/test/OpenMP/target_parallel_codegen_registration.cpp @@ -210,10 +210,10 @@ // TCHECK-DAG: [[ENTRY12:@.+]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr [[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. 
-// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ diff --git a/clang/test/OpenMP/target_parallel_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_depend_codegen.cpp index 22bf654691de6..9cc0f9cfb1699 100644 --- a/clang/test/OpenMP/target_parallel_depend_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. -// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; @@ -233,8 +229,4 @@ int foo(int n) { // CHECK: call void [[HVT2]](i[[SZ]] [[BP1]]) // CHECK: ret i32 0 -// CHECK: define internal void @.omp_offloading.requires_reg() -// CHECK: call void @__tgt_register_requires(i64 1) -// CHECK: ret void - #endif diff --git a/clang/test/OpenMP/target_parallel_for_codegen.cpp b/clang/test/OpenMP/target_parallel_for_codegen.cpp index 697202b7ac412..31f6d0b8ad9ae 100644 --- a/clang/test/OpenMP/target_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_codegen.cpp @@ -1031,60 +1031,60 @@ int bar(int n){ // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]]) -// 
CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !24 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !24 -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !24 -// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !24 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !24 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !24 -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !24 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !24 -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !24 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META24:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META24]] // CHECK1-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !24 -// 
CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !24 -// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !24 -// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !24 +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META24]] +// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 -// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !24 +// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]] // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK1-NEXT: store i32 3, ptr [[TMP18]], align 4, !noalias !24 +// CHECK1-NEXT: store i32 3, ptr [[TMP18]], align 4, !noalias [[META24]] // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 8, !noalias !24 +// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 8, !noalias !24 +// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP21]], align 8, !noalias !24 +// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP21]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP22]], align 8, !noalias !24 +// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP22]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK1-NEXT: store ptr null, ptr [[TMP23]], align 8, !noalias !24 +// CHECK1-NEXT: store ptr null, ptr [[TMP23]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK1-NEXT: store ptr null, ptr [[TMP24]], align 8, !noalias !24 +// CHECK1-NEXT: store ptr null, ptr [[TMP24]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK1-NEXT: store i64 0, ptr [[TMP25]], align 8, !noalias !24 +// CHECK1-NEXT: store i64 0, ptr [[TMP25]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK1-NEXT: store i64 1, ptr [[TMP26]], align 8, !noalias !24 +// CHECK1-NEXT: store i64 1, ptr [[TMP26]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP27]], align 4, !noalias !24 +// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP27]], align 4, !noalias [[META24]] // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4, !noalias !24 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4, !noalias [[META24]] // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK1-NEXT: store i32 0, ptr [[TMP29]], align 4, !noalias !24 +// CHECK1-NEXT: store i32 0, ptr [[TMP29]], align 4, !noalias [[META24]] // CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l128.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK1-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 // CHECK1-NEXT: br i1 [[TMP31]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK1: omp_offload.failed.i: // CHECK1-NEXT: [[TMP32:%.*]] = load i16, ptr [[TMP12]], align 2 -// CHECK1-NEXT: store i16 [[TMP32]], ptr [[AA_CASTED_I]], align 2, !noalias !24 -// CHECK1-NEXT: [[TMP33:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias !24 +// CHECK1-NEXT: store i16 [[TMP32]], ptr [[AA_CASTED_I]], align 2, !noalias [[META24]] +// CHECK1-NEXT: [[TMP33:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK1-NEXT: store i32 [[TMP34]], ptr [[LIN_CASTED_I]], align 4, !noalias !24 -// CHECK1-NEXT: [[TMP35:%.*]] = load i64, ptr [[LIN_CASTED_I]], align 8, !noalias !24 +// CHECK1-NEXT: store i32 [[TMP34]], ptr [[LIN_CASTED_I]], align 4, !noalias [[META24]] +// CHECK1-NEXT: [[TMP35:%.*]] = load i64, ptr [[LIN_CASTED_I]], align 8, !noalias [[META24]] // CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK1-NEXT: store i32 [[TMP36]], ptr [[A_CASTED_I]], align 4, !noalias !24 -// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[A_CASTED_I]], align 8, !noalias !24 +// CHECK1-NEXT: store i32 [[TMP36]], ptr [[A_CASTED_I]], align 4, 
!noalias [[META24]] +// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[A_CASTED_I]], align 8, !noalias [[META24]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l128(i64 [[TMP33]], i64 [[TMP35]], i64 [[TMP37]]) #[[ATTR3]] // CHECK1-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK1: .omp_outlined..exit: @@ -1975,13 +1975,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z7get_valv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -2701,60 +2694,60 @@ int bar(int n){ // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !25 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META25:![0-9]+]] +// CHECK3-NEXT: store ptr 
[[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META25]] // CHECK3-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !25 -// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !25 -// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !25 -// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias !25 +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 -// CHECK3-NEXT: store i32 
2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !25 +// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK3-NEXT: store i32 3, ptr [[TMP18]], align 4, !noalias !25 +// CHECK3-NEXT: store i32 3, ptr [[TMP18]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 4, !noalias !25 +// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 4, !noalias !25 +// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP21]], align 4, !noalias !25 +// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP21]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP22]], align 4, !noalias !25 +// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP22]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK3-NEXT: store ptr null, ptr [[TMP23]], align 4, !noalias !25 +// CHECK3-NEXT: store ptr null, ptr [[TMP23]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK3-NEXT: store ptr null, ptr [[TMP24]], align 
4, !noalias !25 +// CHECK3-NEXT: store ptr null, ptr [[TMP24]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK3-NEXT: store i64 0, ptr [[TMP25]], align 8, !noalias !25 +// CHECK3-NEXT: store i64 0, ptr [[TMP25]], align 8, !noalias [[META25]] // CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK3-NEXT: store i64 1, ptr [[TMP26]], align 8, !noalias !25 +// CHECK3-NEXT: store i64 1, ptr [[TMP26]], align 8, !noalias [[META25]] // CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK3-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP27]], align 4, !noalias !25 +// CHECK3-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP27]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4, !noalias !25 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK3-NEXT: store i32 0, ptr [[TMP29]], align 4, !noalias !25 +// CHECK3-NEXT: store i32 0, ptr [[TMP29]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l128.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK3-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 // CHECK3-NEXT: br i1 [[TMP31]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK3: omp_offload.failed.i: // CHECK3-NEXT: [[TMP32:%.*]] = load i16, ptr [[TMP12]], align 2 -// CHECK3-NEXT: store 
i16 [[TMP32]], ptr [[AA_CASTED_I]], align 2, !noalias !25 -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias !25 +// CHECK3-NEXT: store i16 [[TMP32]], ptr [[AA_CASTED_I]], align 2, !noalias [[META25]] +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK3-NEXT: store i32 [[TMP34]], ptr [[LIN_CASTED_I]], align 4, !noalias !25 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, ptr [[LIN_CASTED_I]], align 4, !noalias !25 +// CHECK3-NEXT: store i32 [[TMP34]], ptr [[LIN_CASTED_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, ptr [[LIN_CASTED_I]], align 4, !noalias [[META25]] // CHECK3-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK3-NEXT: store i32 [[TMP36]], ptr [[A_CASTED_I]], align 4, !noalias !25 -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[A_CASTED_I]], align 4, !noalias !25 +// CHECK3-NEXT: store i32 [[TMP36]], ptr [[A_CASTED_I]], align 4, !noalias [[META25]] +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[A_CASTED_I]], align 4, !noalias [[META25]] // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l128(i32 [[TMP33]], i32 [[TMP35]], i32 [[TMP37]]) #[[ATTR3]] // CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK3: .omp_outlined..exit: @@ -3645,13 +3638,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -5935,60 +5921,60 @@ int bar(int n){ // CHECK17-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]]) // CHECK17-NEXT: call 
void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) // CHECK17-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]]) -// CHECK17-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !24 -// CHECK17-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !24 -// CHECK17-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !24 -// CHECK17-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !24 -// CHECK17-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !24 -// CHECK17-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !24 -// CHECK17-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !24 -// CHECK17-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !24 -// CHECK17-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !24 +// CHECK17-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META24:![0-9]+]] +// CHECK17-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META24]] // CHECK17-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK17-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !24 -// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !24 -// CHECK17-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !24 -// CHECK17-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !24 +// CHECK17-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META24]] +// CHECK17-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK17-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 -// CHECK17-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !24 +// CHECK17-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]] // CHECK17-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK17-NEXT: store i32 3, ptr [[TMP18]], align 4, !noalias !24 +// CHECK17-NEXT: store i32 3, ptr [[TMP18]], align 4, !noalias [[META24]] // CHECK17-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK17-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 8, !noalias !24 +// CHECK17-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK17-NEXT: 
store ptr [[TMP14]], ptr [[TMP20]], align 8, !noalias !24 +// CHECK17-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK17-NEXT: store ptr [[TMP15]], ptr [[TMP21]], align 8, !noalias !24 +// CHECK17-NEXT: store ptr [[TMP15]], ptr [[TMP21]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK17-NEXT: store ptr @.offload_maptypes, ptr [[TMP22]], align 8, !noalias !24 +// CHECK17-NEXT: store ptr @.offload_maptypes, ptr [[TMP22]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK17-NEXT: store ptr null, ptr [[TMP23]], align 8, !noalias !24 +// CHECK17-NEXT: store ptr null, ptr [[TMP23]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK17-NEXT: store ptr null, ptr [[TMP24]], align 8, !noalias !24 +// CHECK17-NEXT: store ptr null, ptr [[TMP24]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK17-NEXT: store i64 0, ptr [[TMP25]], align 8, !noalias !24 +// CHECK17-NEXT: store i64 0, ptr [[TMP25]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK17-NEXT: store i64 1, ptr [[TMP26]], align 8, !noalias !24 +// CHECK17-NEXT: store i64 1, ptr [[TMP26]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK17-NEXT: store [3 x i32] [i32 1, i32 0, 
i32 0], ptr [[TMP27]], align 4, !noalias !24 +// CHECK17-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP27]], align 4, !noalias [[META24]] // CHECK17-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK17-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4, !noalias !24 +// CHECK17-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4, !noalias [[META24]] // CHECK17-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK17-NEXT: store i32 0, ptr [[TMP29]], align 4, !noalias !24 +// CHECK17-NEXT: store i32 0, ptr [[TMP29]], align 4, !noalias [[META24]] // CHECK17-NEXT: [[TMP30:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l128.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK17-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 // CHECK17-NEXT: br i1 [[TMP31]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK17: omp_offload.failed.i: // CHECK17-NEXT: [[TMP32:%.*]] = load i16, ptr [[TMP12]], align 2 -// CHECK17-NEXT: store i16 [[TMP32]], ptr [[AA_CASTED_I]], align 2, !noalias !24 -// CHECK17-NEXT: [[TMP33:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias !24 +// CHECK17-NEXT: store i16 [[TMP32]], ptr [[AA_CASTED_I]], align 2, !noalias [[META24]] +// CHECK17-NEXT: [[TMP33:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK17-NEXT: store i32 [[TMP34]], ptr [[LIN_CASTED_I]], align 4, !noalias !24 -// CHECK17-NEXT: [[TMP35:%.*]] = load i64, ptr [[LIN_CASTED_I]], align 8, !noalias !24 +// CHECK17-NEXT: store i32 [[TMP34]], ptr [[LIN_CASTED_I]], align 4, !noalias [[META24]] +// CHECK17-NEXT: [[TMP35:%.*]] = load i64, ptr [[LIN_CASTED_I]], align 8, !noalias [[META24]] // CHECK17-NEXT: [[TMP36:%.*]] = load 
i32, ptr [[TMP17]], align 4 -// CHECK17-NEXT: store i32 [[TMP36]], ptr [[A_CASTED_I]], align 4, !noalias !24 -// CHECK17-NEXT: [[TMP37:%.*]] = load i64, ptr [[A_CASTED_I]], align 8, !noalias !24 +// CHECK17-NEXT: store i32 [[TMP36]], ptr [[A_CASTED_I]], align 4, !noalias [[META24]] +// CHECK17-NEXT: [[TMP37:%.*]] = load i64, ptr [[A_CASTED_I]], align 8, !noalias [[META24]] // CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l128(i64 [[TMP33]], i64 [[TMP35]], i64 [[TMP37]]) #[[ATTR3]] // CHECK17-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK17: .omp_outlined..exit: @@ -6879,13 +6865,6 @@ int bar(int n){ // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR5]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@_Z7get_valv // CHECK19-SAME: () #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -7605,60 +7584,60 @@ int bar(int n){ // CHECK19-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]]) // CHECK19-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) // CHECK19-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]]) -// CHECK19-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: [[TMP10:%.*]] = load 
ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !25 +// CHECK19-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META25:![0-9]+]] +// CHECK19-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META25]] // CHECK19-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK19-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !25 -// CHECK19-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !25 -// CHECK19-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !25 -// CHECK19-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias !25 +// CHECK19-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: [[TMP15:%.*]] = 
load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK19-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 -// CHECK19-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !25 +// CHECK19-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK19-NEXT: store i32 3, ptr [[TMP18]], align 4, !noalias !25 +// CHECK19-NEXT: store i32 3, ptr [[TMP18]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK19-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 4, !noalias !25 +// CHECK19-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK19-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 4, !noalias !25 +// CHECK19-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK19-NEXT: store ptr [[TMP15]], ptr [[TMP21]], align 4, !noalias !25 +// CHECK19-NEXT: store ptr [[TMP15]], ptr [[TMP21]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK19-NEXT: store ptr @.offload_maptypes, ptr [[TMP22]], align 4, !noalias !25 +// CHECK19-NEXT: store ptr @.offload_maptypes, ptr [[TMP22]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// 
CHECK19-NEXT: store ptr null, ptr [[TMP23]], align 4, !noalias !25 +// CHECK19-NEXT: store ptr null, ptr [[TMP23]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK19-NEXT: store ptr null, ptr [[TMP24]], align 4, !noalias !25 +// CHECK19-NEXT: store ptr null, ptr [[TMP24]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK19-NEXT: store i64 0, ptr [[TMP25]], align 8, !noalias !25 +// CHECK19-NEXT: store i64 0, ptr [[TMP25]], align 8, !noalias [[META25]] // CHECK19-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK19-NEXT: store i64 1, ptr [[TMP26]], align 8, !noalias !25 +// CHECK19-NEXT: store i64 1, ptr [[TMP26]], align 8, !noalias [[META25]] // CHECK19-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK19-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP27]], align 4, !noalias !25 +// CHECK19-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP27]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK19-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4, !noalias !25 +// CHECK19-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK19-NEXT: store i32 0, ptr [[TMP29]], align 4, !noalias !25 +// CHECK19-NEXT: store i32 0, ptr [[TMP29]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP30:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr 
@.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l128.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK19-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 // CHECK19-NEXT: br i1 [[TMP31]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK19: omp_offload.failed.i: // CHECK19-NEXT: [[TMP32:%.*]] = load i16, ptr [[TMP12]], align 2 -// CHECK19-NEXT: store i16 [[TMP32]], ptr [[AA_CASTED_I]], align 2, !noalias !25 -// CHECK19-NEXT: [[TMP33:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias !25 +// CHECK19-NEXT: store i16 [[TMP32]], ptr [[AA_CASTED_I]], align 2, !noalias [[META25]] +// CHECK19-NEXT: [[TMP33:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK19-NEXT: store i32 [[TMP34]], ptr [[LIN_CASTED_I]], align 4, !noalias !25 -// CHECK19-NEXT: [[TMP35:%.*]] = load i32, ptr [[LIN_CASTED_I]], align 4, !noalias !25 +// CHECK19-NEXT: store i32 [[TMP34]], ptr [[LIN_CASTED_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: [[TMP35:%.*]] = load i32, ptr [[LIN_CASTED_I]], align 4, !noalias [[META25]] // CHECK19-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK19-NEXT: store i32 [[TMP36]], ptr [[A_CASTED_I]], align 4, !noalias !25 -// CHECK19-NEXT: [[TMP37:%.*]] = load i32, ptr [[A_CASTED_I]], align 4, !noalias !25 +// CHECK19-NEXT: store i32 [[TMP36]], ptr [[A_CASTED_I]], align 4, !noalias [[META25]] +// CHECK19-NEXT: [[TMP37:%.*]] = load i32, ptr [[A_CASTED_I]], align 4, !noalias [[META25]] // CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l128(i32 [[TMP33]], i32 [[TMP35]], i32 [[TMP37]]) #[[ATTR3]] // CHECK19-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK19: .omp_outlined..exit: @@ -8548,10 +8527,3 @@ int bar(int n){ // CHECK19-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) // CHECK19-NEXT: ret void // -// -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// 
CHECK19-SAME: () #[[ATTR5]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_parallel_for_codegen_registration.cpp b/clang/test/OpenMP/target_parallel_for_codegen_registration.cpp index 6b13a842df114..b65241109b0f8 100644 --- a/clang/test/OpenMP/target_parallel_for_codegen_registration.cpp +++ b/clang/test/OpenMP/target_parallel_for_codegen_registration.cpp @@ -210,10 +210,10 @@ // TCHECK-DAG: [[ENTRY12:@.+]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr [[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. -// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ diff --git a/clang/test/OpenMP/target_parallel_for_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_for_depend_codegen.cpp index 801b188e97ca9..69cfa2a6e5894 100644 --- a/clang/test/OpenMP/target_parallel_for_depend_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp index f78a71585ce7f..7751ae93d59fa 100644 --- a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp @@ -714,38 +714,38 @@ int bar(int n){ // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !25 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !25 -// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !25 -// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !25 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !25 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !25 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !25 -// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !25 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META25:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META25]] +// CHECK1-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META25]] +// CHECK1-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META25]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META25]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias 
[[META25]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]] +// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]] // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK1-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias !25 +// CHECK1-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias [[META25]] // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK1-NEXT: store ptr null, ptr [[TMP10]], align 8, !noalias !25 +// CHECK1-NEXT: store ptr null, ptr [[TMP10]], align 8, !noalias [[META25]] // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8, !noalias !25 +// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8, !noalias [[META25]] // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8, !noalias !25 +// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8, !noalias [[META25]] // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK1-NEXT: store ptr null, ptr [[TMP13]], align 8, !noalias !25 +// CHECK1-NEXT: store ptr null, ptr [[TMP13]], align 8, !noalias [[META25]] // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8, !noalias !25 +// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8, !noalias [[META25]] // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK1-NEXT: store ptr null, ptr [[TMP15]], align 
8, !noalias !25 +// CHECK1-NEXT: store ptr null, ptr [[TMP15]], align 8, !noalias [[META25]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK1-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias !25 +// CHECK1-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias [[META25]] // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK1-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias !25 +// CHECK1-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias [[META25]] // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias !25 +// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias [[META25]] // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias !25 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias [[META25]] // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK1-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias !25 +// CHECK1-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias [[META25]] // CHECK1-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK1-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 // CHECK1-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] @@ -1917,13 +1917,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z7get_valv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -2341,38 +2334,38 @@ int bar(int n){ // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !26 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !26 -// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !26 -// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !26 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !26 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !26 -// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !26 -// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !26 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META26:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META26]] +// CHECK3-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META26]] +// CHECK3-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META26]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META26]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]] +// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]] +// CHECK3-NEXT: store i32 2, ptr 
[[KERNEL_ARGS_I]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK3-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias !26 +// CHECK3-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK3-NEXT: store ptr null, ptr [[TMP10]], align 4, !noalias !26 +// CHECK3-NEXT: store ptr null, ptr [[TMP10]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4, !noalias !26 +// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4, !noalias !26 +// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK3-NEXT: store ptr null, ptr [[TMP13]], align 4, !noalias !26 +// CHECK3-NEXT: store ptr null, ptr [[TMP13]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4, !noalias !26 +// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK3-NEXT: store ptr null, ptr [[TMP15]], align 4, !noalias !26 +// CHECK3-NEXT: store ptr null, ptr [[TMP15]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr 
inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK3-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias !26 +// CHECK3-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias [[META26]] // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK3-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias !26 +// CHECK3-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias [[META26]] // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK3-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias !26 +// CHECK3-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias !26 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK3-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias !26 +// CHECK3-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias [[META26]] // CHECK3-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK3-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 // CHECK3-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] @@ -3542,13 +3535,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// 
CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z7get_valv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -3970,38 +3956,38 @@ int bar(int n){ // CHECK5-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]]) // CHECK5-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) // CHECK5-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]]) -// CHECK5-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !25 -// CHECK5-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !25 -// CHECK5-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !25 -// CHECK5-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !25 -// CHECK5-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !25 -// CHECK5-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !25 -// CHECK5-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !25 -// CHECK5-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !25 +// CHECK5-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META25:![0-9]+]] +// CHECK5-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META25]] +// CHECK5-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META25]] +// CHECK5-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META25]] +// CHECK5-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META25]] +// CHECK5-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]] +// CHECK5-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]] +// CHECK5-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]] // CHECK5-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, 
i32 1 -// CHECK5-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias !25 +// CHECK5-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias [[META25]] // CHECK5-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK5-NEXT: store ptr null, ptr [[TMP10]], align 8, !noalias !25 +// CHECK5-NEXT: store ptr null, ptr [[TMP10]], align 8, !noalias [[META25]] // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK5-NEXT: store ptr null, ptr [[TMP11]], align 8, !noalias !25 +// CHECK5-NEXT: store ptr null, ptr [[TMP11]], align 8, !noalias [[META25]] // CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK5-NEXT: store ptr null, ptr [[TMP12]], align 8, !noalias !25 +// CHECK5-NEXT: store ptr null, ptr [[TMP12]], align 8, !noalias [[META25]] // CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK5-NEXT: store ptr null, ptr [[TMP13]], align 8, !noalias !25 +// CHECK5-NEXT: store ptr null, ptr [[TMP13]], align 8, !noalias [[META25]] // CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK5-NEXT: store ptr null, ptr [[TMP14]], align 8, !noalias !25 +// CHECK5-NEXT: store ptr null, ptr [[TMP14]], align 8, !noalias [[META25]] // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK5-NEXT: store ptr null, ptr [[TMP15]], align 8, !noalias !25 +// CHECK5-NEXT: store ptr null, ptr [[TMP15]], align 8, !noalias [[META25]] // CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK5-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias !25 +// CHECK5-NEXT: store 
i64 0, ptr [[TMP16]], align 8, !noalias [[META25]] // CHECK5-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK5-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias !25 +// CHECK5-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias [[META25]] // CHECK5-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK5-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias !25 +// CHECK5-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias [[META25]] // CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK5-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias !25 +// CHECK5-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias [[META25]] // CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK5-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias !25 +// CHECK5-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias [[META25]] // CHECK5-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK5-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 // CHECK5-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] @@ -5035,11 +5021,11 @@ int bar(int n){ // CHECK5-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP13]] to double // CHECK5-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 // CHECK5-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK5-NEXT: store double [[ADD]], ptr [[A]], align 8, !nontemporal !39, !llvm.access.group [[ACC_GRP38]] +// CHECK5-NEXT: store double [[ADD]], ptr [[A]], align 
8, !nontemporal [[META39:![0-9]+]], !llvm.access.group [[ACC_GRP38]] // CHECK5-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[TMP0]], i32 0, i32 0 -// CHECK5-NEXT: [[TMP14:%.*]] = load double, ptr [[A4]], align 8, !nontemporal !39, !llvm.access.group [[ACC_GRP38]] +// CHECK5-NEXT: [[TMP14:%.*]] = load double, ptr [[A4]], align 8, !nontemporal [[META39]], !llvm.access.group [[ACC_GRP38]] // CHECK5-NEXT: [[INC:%.*]] = fadd double [[TMP14]], 1.000000e+00 -// CHECK5-NEXT: store double [[INC]], ptr [[A4]], align 8, !nontemporal !39, !llvm.access.group [[ACC_GRP38]] +// CHECK5-NEXT: store double [[INC]], ptr [[A4]], align 8, !nontemporal [[META39]], !llvm.access.group [[ACC_GRP38]] // CHECK5-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 // CHECK5-NEXT: [[TMP15:%.*]] = mul nsw i64 1, [[TMP2]] // CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i64 [[TMP15]] @@ -5280,13 +5266,6 @@ int bar(int n){ // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@_Z7get_valv // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -5704,38 +5683,38 @@ int bar(int n){ // CHECK7-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) // CHECK7-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]]) // CHECK7-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) -// CHECK7-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !26 -// CHECK7-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !26 -// CHECK7-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !26 -// CHECK7-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !26 -// CHECK7-NEXT: store ptr 
[[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !26 -// CHECK7-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !26 -// CHECK7-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !26 -// CHECK7-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !26 +// CHECK7-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META26:![0-9]+]] +// CHECK7-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META26]] +// CHECK7-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META26]] +// CHECK7-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META26]] +// CHECK7-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META26]] +// CHECK7-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]] +// CHECK7-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]] +// CHECK7-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK7-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias !26 +// CHECK7-NEXT: store i32 0, ptr [[TMP9]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK7-NEXT: store ptr null, ptr [[TMP10]], align 4, !noalias !26 +// CHECK7-NEXT: store ptr null, ptr [[TMP10]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK7-NEXT: store ptr null, ptr [[TMP11]], align 4, !noalias !26 +// CHECK7-NEXT: store ptr null, ptr [[TMP11]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// 
CHECK7-NEXT: store ptr null, ptr [[TMP12]], align 4, !noalias !26 +// CHECK7-NEXT: store ptr null, ptr [[TMP12]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK7-NEXT: store ptr null, ptr [[TMP13]], align 4, !noalias !26 +// CHECK7-NEXT: store ptr null, ptr [[TMP13]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK7-NEXT: store ptr null, ptr [[TMP14]], align 4, !noalias !26 +// CHECK7-NEXT: store ptr null, ptr [[TMP14]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK7-NEXT: store ptr null, ptr [[TMP15]], align 4, !noalias !26 +// CHECK7-NEXT: store ptr null, ptr [[TMP15]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK7-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias !26 +// CHECK7-NEXT: store i64 0, ptr [[TMP16]], align 8, !noalias [[META26]] // CHECK7-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK7-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias !26 +// CHECK7-NEXT: store i64 1, ptr [[TMP17]], align 8, !noalias [[META26]] // CHECK7-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK7-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias !26 +// CHECK7-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK7-NEXT: store [3 x i32] zeroinitializer, ptr 
[[TMP19]], align 4, !noalias !26 +// CHECK7-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK7-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias !26 +// CHECK7-NEXT: store i32 0, ptr [[TMP20]], align 4, !noalias [[META26]] // CHECK7-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK7-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 // CHECK7-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] @@ -6767,11 +6746,11 @@ int bar(int n){ // CHECK7-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP13]] to double // CHECK7-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 // CHECK7-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK7-NEXT: store double [[ADD]], ptr [[A]], align 4, !nontemporal !40, !llvm.access.group [[ACC_GRP39]] +// CHECK7-NEXT: store double [[ADD]], ptr [[A]], align 4, !nontemporal [[META40:![0-9]+]], !llvm.access.group [[ACC_GRP39]] // CHECK7-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[TMP0]], i32 0, i32 0 -// CHECK7-NEXT: [[TMP14:%.*]] = load double, ptr [[A4]], align 4, !nontemporal !40, !llvm.access.group [[ACC_GRP39]] +// CHECK7-NEXT: [[TMP14:%.*]] = load double, ptr [[A4]], align 4, !nontemporal [[META40]], !llvm.access.group [[ACC_GRP39]] // CHECK7-NEXT: [[INC:%.*]] = fadd double [[TMP14]], 1.000000e+00 -// CHECK7-NEXT: store double [[INC]], ptr [[A4]], align 4, !nontemporal !40, !llvm.access.group [[ACC_GRP39]] +// CHECK7-NEXT: store double [[INC]], ptr [[A4]], align 4, !nontemporal [[META40]], !llvm.access.group [[ACC_GRP39]] // CHECK7-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 // CHECK7-NEXT: [[TMP15:%.*]] = mul nsw i32 1, [[TMP2]] 
// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 [[TMP15]] @@ -7012,13 +6991,6 @@ int bar(int n){ // CHECK7-NEXT: ret void // // -// CHECK7-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK7-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK7-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z7get_valv // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -8353,11 +8325,11 @@ int bar(int n){ // CHECK13-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP11]] to double // CHECK13-NEXT: [[ADD3:%.*]] = fadd double [[CONV]], 1.500000e+00 // CHECK13-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK13-NEXT: store double [[ADD3]], ptr [[A]], align 8, !nontemporal !19, !llvm.access.group [[ACC_GRP18]] +// CHECK13-NEXT: store double [[ADD3]], ptr [[A]], align 8, !nontemporal [[META19:![0-9]+]], !llvm.access.group [[ACC_GRP18]] // CHECK13-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP12:%.*]] = load double, ptr [[A4]], align 8, !nontemporal !19, !llvm.access.group [[ACC_GRP18]] +// CHECK13-NEXT: [[TMP12:%.*]] = load double, ptr [[A4]], align 8, !nontemporal [[META19]], !llvm.access.group [[ACC_GRP18]] // CHECK13-NEXT: [[INC:%.*]] = fadd double [[TMP12]], 1.000000e+00 -// CHECK13-NEXT: store double [[INC]], ptr [[A4]], align 8, !nontemporal !19, !llvm.access.group [[ACC_GRP18]] +// CHECK13-NEXT: store double [[INC]], ptr [[A4]], align 8, !nontemporal [[META19]], !llvm.access.group [[ACC_GRP18]] // CHECK13-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 // CHECK13-NEXT: [[TMP13:%.*]] = mul nsw i64 1, [[TMP2]] // CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[VLA]], i64 [[TMP13]] @@ -8876,11 +8848,11 @@ int bar(int n){ // CHECK15-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to double // CHECK15-NEXT: [[ADD3:%.*]] = fadd double 
[[CONV]], 1.500000e+00 // CHECK15-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK15-NEXT: store double [[ADD3]], ptr [[A]], align 4, !nontemporal !20, !llvm.access.group [[ACC_GRP19]] +// CHECK15-NEXT: store double [[ADD3]], ptr [[A]], align 4, !nontemporal [[META20:![0-9]+]], !llvm.access.group [[ACC_GRP19]] // CHECK15-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP11:%.*]] = load double, ptr [[A4]], align 4, !nontemporal !20, !llvm.access.group [[ACC_GRP19]] +// CHECK15-NEXT: [[TMP11:%.*]] = load double, ptr [[A4]], align 4, !nontemporal [[META20]], !llvm.access.group [[ACC_GRP19]] // CHECK15-NEXT: [[INC:%.*]] = fadd double [[TMP11]], 1.000000e+00 -// CHECK15-NEXT: store double [[INC]], ptr [[A4]], align 4, !nontemporal !20, !llvm.access.group [[ACC_GRP19]] +// CHECK15-NEXT: store double [[INC]], ptr [[A4]], align 4, !nontemporal [[META20]], !llvm.access.group [[ACC_GRP19]] // CHECK15-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 // CHECK15-NEXT: [[TMP12:%.*]] = mul nsw i32 1, [[TMP1]] // CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[VLA]], i32 [[TMP12]] @@ -11323,11 +11295,11 @@ int bar(int n){ // CHECK21-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP13]] to double // CHECK21-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 // CHECK21-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK21-NEXT: store double [[ADD]], ptr [[A]], align 8, !nontemporal !27, !llvm.access.group [[ACC_GRP26]] +// CHECK21-NEXT: store double [[ADD]], ptr [[A]], align 8, !nontemporal [[META27:![0-9]+]], !llvm.access.group [[ACC_GRP26]] // CHECK21-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[TMP0]], i32 0, i32 0 -// CHECK21-NEXT: [[TMP14:%.*]] = load double, ptr [[A4]], align 8, !nontemporal !27, !llvm.access.group [[ACC_GRP26]] +// CHECK21-NEXT: [[TMP14:%.*]] = load double, ptr [[A4]], align 8, 
!nontemporal [[META27]], !llvm.access.group [[ACC_GRP26]] // CHECK21-NEXT: [[INC:%.*]] = fadd double [[TMP14]], 1.000000e+00 -// CHECK21-NEXT: store double [[INC]], ptr [[A4]], align 8, !nontemporal !27, !llvm.access.group [[ACC_GRP26]] +// CHECK21-NEXT: store double [[INC]], ptr [[A4]], align 8, !nontemporal [[META27]], !llvm.access.group [[ACC_GRP26]] // CHECK21-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 // CHECK21-NEXT: [[TMP15:%.*]] = mul nsw i64 1, [[TMP2]] // CHECK21-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i64 [[TMP15]] @@ -12213,11 +12185,11 @@ int bar(int n){ // CHECK23-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP13]] to double // CHECK23-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 // CHECK23-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK23-NEXT: store double [[ADD]], ptr [[A]], align 4, !nontemporal !28, !llvm.access.group [[ACC_GRP27]] +// CHECK23-NEXT: store double [[ADD]], ptr [[A]], align 4, !nontemporal [[META28:![0-9]+]], !llvm.access.group [[ACC_GRP27]] // CHECK23-NEXT: [[A4:%.*]] = getelementptr inbounds [[STRUCT_S1]], ptr [[TMP0]], i32 0, i32 0 -// CHECK23-NEXT: [[TMP14:%.*]] = load double, ptr [[A4]], align 4, !nontemporal !28, !llvm.access.group [[ACC_GRP27]] +// CHECK23-NEXT: [[TMP14:%.*]] = load double, ptr [[A4]], align 4, !nontemporal [[META28]], !llvm.access.group [[ACC_GRP27]] // CHECK23-NEXT: [[INC:%.*]] = fadd double [[TMP14]], 1.000000e+00 -// CHECK23-NEXT: store double [[INC]], ptr [[A4]], align 4, !nontemporal !28, !llvm.access.group [[ACC_GRP27]] +// CHECK23-NEXT: store double [[INC]], ptr [[A4]], align 4, !nontemporal [[META28]], !llvm.access.group [[ACC_GRP27]] // CHECK23-NEXT: [[CONV5:%.*]] = fptosi double [[INC]] to i16 // CHECK23-NEXT: [[TMP15:%.*]] = mul nsw i32 1, [[TMP2]] // CHECK23-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 [[TMP15]] diff --git 
a/clang/test/OpenMP/target_parallel_for_simd_codegen_registration.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen_registration.cpp index 4d0eec6571738..4c996c88de530 100644 --- a/clang/test/OpenMP/target_parallel_for_simd_codegen_registration.cpp +++ b/clang/test/OpenMP/target_parallel_for_simd_codegen_registration.cpp @@ -210,10 +210,10 @@ // TCHECK-DAG: [[ENTRY12:@.+]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr [[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. -// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ diff --git a/clang/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp index f848922a034e6..b3fdb53c80c5c 100644 --- a/clang/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_simd_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp index 1df762c9fa0ee..fa3d182b62ad2 100644 --- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp +++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp @@ -426,13 +426,6 @@ int bar(int a){ // CHECK-NEXT: ret void // // -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR3]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// -// // // // @@ -1458,7 +1451,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SAC1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4:[0-9]+]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1468,7 +1461,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SAD1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1478,7 +1471,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SAC2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: 
(ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -1497,7 +1490,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SAD2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -1532,7 +1525,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SBC1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1542,7 +1535,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SBD1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1552,7 +1545,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SBC2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 
dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -1571,7 +1564,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SBD2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -1606,7 +1599,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SCC1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1616,7 +1609,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SCD1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1626,7 +1619,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SCC2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) 
unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -1725,7 +1718,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SCD2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -1752,7 +1745,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SDC1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1762,7 +1755,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SDD1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1772,7 +1765,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SDC2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] 
comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -1791,7 +1784,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SDD2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -1898,7 +1891,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SEC1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1908,7 +1901,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SED1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1918,7 +1911,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SEC2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // 
CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -2017,7 +2010,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SED2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -2124,7 +2117,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EEC1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -2134,7 +2127,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EED1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -2144,7 +2137,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EEC2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // 
CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -2243,7 +2236,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EED2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -2350,7 +2343,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -2360,7 +2353,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EED1Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -2370,7 +2363,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr 
#[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -2469,7 +2462,7 @@ int bar(int a){ // // // CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EED2Ev -// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat { +// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat { // CHECK-NTARGET-NEXT: entry: // CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -3119,7 +3112,7 @@ int bar(int a){ // // // SIMD-ONLY2-LABEL: define {{[^@]+}}@_Z3bari -// SIMD-ONLY2-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// SIMD-ONLY2-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR1]] { // SIMD-ONLY2-NEXT: entry: // SIMD-ONLY2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // SIMD-ONLY2-NEXT: [[R:%.*]] = alloca i32, align 4 @@ -3161,7 +3154,7 @@ int bar(int a){ // // // SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SA3fooEv -// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR3]] comdat { +// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY2-NEXT: entry: // SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -3180,7 +3173,7 @@ int bar(int a){ // // // SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SB3fooEv -// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR3]] comdat { +// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY2-NEXT: entry: // SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -3214,7 +3207,7 @@ int bar(int a){ // // // SIMD-ONLY2-LABEL: define 
{{[^@]+}}@_ZN2SC3fooEv -// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR3]] comdat { +// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY2-NEXT: entry: // SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -3233,7 +3226,7 @@ int bar(int a){ // // // SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SD3fooEv -// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR3]] comdat { +// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY2-NEXT: entry: // SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -3252,7 +3245,7 @@ int bar(int a){ // // // SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SE3fooEv -// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR3]] comdat { +// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY2-NEXT: entry: // SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -3286,7 +3279,7 @@ int bar(int a){ // // // SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi100EE3fooEv -// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR3]] comdat { +// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY2-NEXT: entry: // SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -3320,7 +3313,7 @@ int bar(int a){ // // // SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi1000EE3fooEv -// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR3]] comdat { +// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) 
[[THIS:%.*]]) #[[ATTR1]] comdat { // SIMD-ONLY2-NEXT: entry: // SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -3927,13 +3920,6 @@ int bar(int a){ // OMP-DEFAULT-NEXT: ret void // // -// OMP-DEFAULT-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// OMP-DEFAULT-SAME: () #[[ATTR0]] { -// OMP-DEFAULT-NEXT: entry: -// OMP-DEFAULT-NEXT: call void @__tgt_register_requires(i64 1) -// OMP-DEFAULT-NEXT: ret void -// -// // // // @@ -5467,7 +5453,7 @@ int bar(int a){ // // // OMP-DEfAULT-LABEL: define {{[^@]+}}@_Z3bari -// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR4:[0-9]+]] { +// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR1]] { // OMP-DEfAULT-NEXT: entry: // OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // OMP-DEfAULT-NEXT: [[R:%.*]] = alloca i32, align 4 @@ -5540,7 +5526,7 @@ int bar(int a){ // // // OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SA3fooEv -// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // OMP-DEfAULT-NEXT: entry: // OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -5559,7 +5545,7 @@ int bar(int a){ // // // OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SB3fooEv -// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // OMP-DEfAULT-NEXT: entry: // OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -5624,7 +5610,7 @@ int bar(int a){ // // // OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SC3fooEv -// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// 
OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // OMP-DEfAULT-NEXT: entry: // OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -5643,7 +5629,7 @@ int bar(int a){ // // // OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SD3fooEv -// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // OMP-DEfAULT-NEXT: entry: // OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -5662,7 +5648,7 @@ int bar(int a){ // // // OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SE3fooEv -// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // OMP-DEfAULT-NEXT: entry: // OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -5683,7 +5669,7 @@ int bar(int a){ // // // OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EE3fooEv -// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // OMP-DEfAULT-NEXT: entry: // OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -5748,7 +5734,7 @@ int bar(int a){ // // // OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EE3fooEv -// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 { +// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR1]] comdat align 2 { // 
OMP-DEfAULT-NEXT: entry: // OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -6229,13 +6215,6 @@ int bar(int a){ // OMP-DEfAULT-NEXT: ret void // // -// OMP-DEfAULT-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// OMP-DEfAULT-SAME: () #[[ATTR0]] { -// OMP-DEfAULT-NEXT: entry: -// OMP-DEfAULT-NEXT: call void @__tgt_register_requires(i64 1) -// OMP-DEfAULT-NEXT: ret void -// -// // // // @@ -7801,7 +7780,7 @@ int bar(int a){ // // // CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_Z3bari -// CHECK-NTARGET-OMP-DEFAULT-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-NTARGET-OMP-DEFAULT-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR1]] { // CHECK-NTARGET-OMP-DEFAULT-NEXT: entry: // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[R:%.*]] = alloca i32, align 4 @@ -7830,7 +7809,7 @@ int bar(int a){ // // // CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SA3fooEv -// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat { +// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR1]] comdat { // CHECK-NTARGET-OMP-DEFAULT-NEXT: entry: // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -7849,7 +7828,7 @@ int bar(int a){ // // // CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SB3fooEv -// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR4]] comdat { +// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR1]] comdat { // CHECK-NTARGET-OMP-DEFAULT-NEXT: entry: // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -7870,7 +7849,7 
@@ int bar(int a){ // // // CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SC3fooEv -// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR4]] comdat { +// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR1]] comdat { // CHECK-NTARGET-OMP-DEFAULT-NEXT: entry: // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -7889,7 +7868,7 @@ int bar(int a){ // // // CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SD3fooEv -// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR4]] comdat { +// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR1]] comdat { // CHECK-NTARGET-OMP-DEFAULT-NEXT: entry: // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -7908,7 +7887,7 @@ int bar(int a){ // // // CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SE3fooEv -// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR4]] comdat { +// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR1]] comdat { // CHECK-NTARGET-OMP-DEFAULT-NEXT: entry: // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -7929,7 +7908,7 @@ int bar(int a){ // // // CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EE3fooEv -// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR4]] comdat { +// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR1]] comdat { // CHECK-NTARGET-OMP-DEFAULT-NEXT: entry: // 
CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4 @@ -7950,7 +7929,7 @@ int bar(int a){ // // // CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EE3fooEv -// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR4]] comdat { +// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR1]] comdat { // CHECK-NTARGET-OMP-DEFAULT-NEXT: entry: // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4 diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp index 722a9c1bfce79..a7b7278dff8d9 100644 --- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp +++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp @@ -283,13 +283,6 @@ int nested(int a){ // CHECK-NEXT: ret void // // -// CHECK-LABEL: define internal void @.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// -// // CHECK-X86-LABEL: define dso_local noundef i32 @_Z6nestedi // CHECK-X86-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-X86-NEXT: entry: @@ -508,13 +501,6 @@ int nested(int a){ // CHECK-X86-NEXT: ret void // // -// CHECK-X86-LABEL: define internal void @.omp_offloading.requires_reg -// CHECK-X86-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK-X86-NEXT: entry: -// CHECK-X86-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-X86-NEXT: ret void -// -// // SIMD-ONLY0-LABEL: define dso_local noundef signext i32 @_Z6nestedi // SIMD-ONLY0-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] { // SIMD-ONLY0-NEXT: entry: diff --git 
a/clang/test/OpenMP/target_parallel_generic_loop_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_depend_codegen.cpp index 97bc63d49f7fc..c7fb1e5135d37 100644 --- a/clang/test/OpenMP/target_parallel_generic_loop_depend_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_generic_loop_depend_codegen.cpp @@ -133,13 +133,6 @@ int foo(int n) { // // // -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// -// // // // diff --git a/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp index 50b38c4855d0a..472811c5c5667 100644 --- a/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp @@ -202,10 +202,3 @@ void foo() { // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]]) // CHECK-NEXT: ret void // -// -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_parallel_if_codegen.cpp b/clang/test/OpenMP/target_parallel_if_codegen.cpp index 5ad2c6a9879f8..841c49e31ccb9 100644 --- a/clang/test/OpenMP/target_parallel_if_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_if_codegen.cpp @@ -844,13 +844,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3bari // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: 
@@ -1513,13 +1506,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l104 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: diff --git a/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp index 434c18fa3fcdc..0778bca8c3f43 100644 --- a/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp @@ -767,13 +767,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3bari // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1352,13 +1345,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l104 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: diff --git a/clang/test/OpenMP/target_simd_codegen.cpp b/clang/test/OpenMP/target_simd_codegen.cpp index 9dd8b07010f13..e2ff3d5f53e44 100644 --- a/clang/test/OpenMP/target_simd_codegen.cpp +++ b/clang/test/OpenMP/target_simd_codegen.cpp @@ 
-110,10 +110,6 @@ // TCHECK: @{{.+}} = weak constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. -// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_simd_codegen_registration.cpp b/clang/test/OpenMP/target_simd_codegen_registration.cpp index 497055fbf0866..4c0ca8947b3fc 100644 --- a/clang/test/OpenMP/target_simd_codegen_registration.cpp +++ b/clang/test/OpenMP/target_simd_codegen_registration.cpp @@ -210,10 +210,10 @@ // TCHECK-DAG: [[ENTRY12:@.+]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr [[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. -// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ diff --git a/clang/test/OpenMP/target_simd_depend_codegen.cpp b/clang/test/OpenMP/target_simd_depend_codegen.cpp index e7ce853d7eb0c..fa341d8f10659 100644 --- a/clang/test/OpenMP/target_simd_depend_codegen.cpp +++ b/clang/test/OpenMP/target_simd_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_task_affinity_codegen.cpp b/clang/test/OpenMP/target_task_affinity_codegen.cpp index 26ccfde37a9f3..472c50235bf9e 100644 --- a/clang/test/OpenMP/target_task_affinity_codegen.cpp +++ b/clang/test/OpenMP/target_task_affinity_codegen.cpp @@ -241,51 +241,44 @@ int main() { // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META7:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META11:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !13 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !13 -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !13 -// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !13 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !13 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !13 -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !13 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !13 -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !13 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META13:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META13]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META13]] +// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META13]] +// CHECK1-NEXT: store ptr [[TMP3]], 
ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META13]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META13]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META13]] +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META13]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META13]] // CHECK1-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR1]] -// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !13 -// CHECK1-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias !13 +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META13]] +// CHECK1-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias [[META13]] // CHECK1-NEXT: br label [[FOR_COND_I:%.*]] // CHECK1: for.cond.i: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !13 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META13]] // CHECK1-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP13]], 1024 // CHECK1-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK1: for.body.i: // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP12]], align 8 -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !13 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META13]] // CHECK1-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP15]] to i64 // CHECK1-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[IDXPROM_I]] // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I]], align 4 // CHECK1-NEXT: [[MUL_I:%.*]] = mul nsw i32 2, [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[TMP17]], align 8 -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_I]], align 4, 
!noalias !13 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META13]] // CHECK1-NEXT: [[IDXPROM1_I:%.*]] = sext i32 [[TMP19]] to i64 // CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[IDXPROM1_I]] // CHECK1-NEXT: store i32 [[MUL_I]], ptr [[ARRAYIDX2_I]], align 4 -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !13 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META13]] // CHECK1-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK1-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4, !noalias !13 +// CHECK1-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4, !noalias [[META13]] // CHECK1-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP14:![0-9]+]] // CHECK1: .omp_outlined..exit: // CHECK1-NEXT: ret i32 0 // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -471,49 +464,42 @@ int main() { // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias 
!14 -// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !14 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META14:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR1]] -// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !14 -// CHECK3-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias !14 +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META14]] +// CHECK3-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: br label [[FOR_COND_I:%.*]] // CHECK3: for.cond.i: -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !14 +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP13]], 1024 // CHECK3-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK3: 
for.body.i: // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP12]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !14 +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 [[TMP15]] // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I]], align 4 // CHECK3-NEXT: [[MUL_I:%.*]] = mul nsw i32 2, [[TMP16]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP9]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[TMP17]], align 4 -// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !14 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 [[TMP19]] // CHECK3-NEXT: store i32 [[MUL_I]], ptr [[ARRAYIDX1_I]], align 4 -// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !14 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK3-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4, !noalias !14 +// CHECK3-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4, !noalias [[META14]] // CHECK3-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP15:![0-9]+]] // CHECK3: .omp_outlined..exit: // CHECK3-NEXT: ret i32 0 // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -668,39 +654,39 @@ int main() { // CHECK9-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) // CHECK9-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) // CHECK9-NEXT: call void 
@llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) -// CHECK9-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 -// CHECK9-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 -// CHECK9-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 -// CHECK9-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 -// CHECK9-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 -// CHECK9-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK9-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 -// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 -// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// CHECK9-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META14:![0-9]+]] +// CHECK9-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META14]] +// CHECK9-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META14]] +// CHECK9-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META14]] +// CHECK9-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META14]] +// CHECK9-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META14]] +// CHECK9-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META14]] +// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META14]] +// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META14]] // CHECK9-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR3:[0-9]+]] -// CHECK9-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !14 -// CHECK9-NEXT: store i32 0, 
ptr [[I_I]], align 4, !noalias !14 +// CHECK9-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META14]] +// CHECK9-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK9-NEXT: br label [[FOR_COND_I:%.*]] // CHECK9: for.cond.i: -// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !14 +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK9-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP13]], 1024 // CHECK9-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK9: for.body.i: // CHECK9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP12]], align 8 -// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !14 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK9-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP15]] to i64 // CHECK9-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[IDXPROM_I]] // CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I]], align 4 // CHECK9-NEXT: [[MUL_I:%.*]] = mul nsw i32 2, [[TMP16]] // CHECK9-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP9]], align 8 // CHECK9-NEXT: [[TMP18:%.*]] = load ptr, ptr [[TMP17]], align 8 -// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !14 +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK9-NEXT: [[IDXPROM1_I:%.*]] = sext i32 [[TMP19]] to i64 // CHECK9-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[IDXPROM1_I]] // CHECK9-NEXT: store i32 [[MUL_I]], ptr [[ARRAYIDX2_I]], align 4 -// CHECK9-NEXT: [[TMP20:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !14 +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META14]] // CHECK9-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK9-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4, !noalias !14 +// CHECK9-NEXT: store i32 [[INC_I]], ptr 
[[I_I]], align 4, !noalias [[META14]] // CHECK9-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP15:![0-9]+]] // CHECK9: .omp_outlined..exit: // CHECK9-NEXT: ret i32 0 @@ -790,37 +776,37 @@ int main() { // CHECK11-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]) // CHECK11-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META11:![0-9]+]]) // CHECK11-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]]) -// CHECK11-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !15 +// CHECK11-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META15:![0-9]+]] +// CHECK11-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META15]] +// CHECK11-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META15]] +// CHECK11-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META15]] +// CHECK11-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META15]] +// CHECK11-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META15]] +// CHECK11-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias 
[[META15]] +// CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META15]] +// CHECK11-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META15]] // CHECK11-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR3:[0-9]+]] -// CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !15 -// CHECK11-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias !15 +// CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META15]] +// CHECK11-NEXT: store i32 0, ptr [[I_I]], align 4, !noalias [[META15]] // CHECK11-NEXT: br label [[FOR_COND_I:%.*]] // CHECK11: for.cond.i: -// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !15 +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META15]] // CHECK11-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP13]], 1024 // CHECK11-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK11: for.body.i: // CHECK11-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP12]], align 4 -// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !15 +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META15]] // CHECK11-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 [[TMP15]] // CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I]], align 4 // CHECK11-NEXT: [[MUL_I:%.*]] = mul nsw i32 2, [[TMP16]] // CHECK11-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP9]], align 4 // CHECK11-NEXT: [[TMP18:%.*]] = load ptr, ptr [[TMP17]], align 4 -// CHECK11-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !15 +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META15]] // CHECK11-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 [[TMP19]] // CHECK11-NEXT: store i32 [[MUL_I]], ptr [[ARRAYIDX1_I]], align 4 -// 
CHECK11-NEXT: [[TMP20:%.*]] = load i32, ptr [[I_I]], align 4, !noalias !15 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, ptr [[I_I]], align 4, !noalias [[META15]] // CHECK11-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP20]], 1 -// CHECK11-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4, !noalias !15 +// CHECK11-NEXT: store i32 [[INC_I]], ptr [[I_I]], align 4, !noalias [[META15]] // CHECK11-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP16:![0-9]+]] // CHECK11: .omp_outlined..exit: // CHECK11-NEXT: ret i32 0 diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp index 8790a0fc87cbb..e22fcbbcd277b 100644 --- a/clang/test/OpenMP/target_teams_codegen.cpp +++ b/clang/test/OpenMP/target_teams_codegen.cpp @@ -1897,13 +1897,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3fooi // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -3492,13 +3485,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[AA:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_codegen_registration.cpp b/clang/test/OpenMP/target_teams_codegen_registration.cpp index 141e803bb47ff..60c596853a64f 100644 --- a/clang/test/OpenMP/target_teams_codegen_registration.cpp +++ 
b/clang/test/OpenMP/target_teams_codegen_registration.cpp @@ -210,10 +210,10 @@ // TCHECK-DAG: [[ENTRY12:@.+]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr [[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. -// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ diff --git a/clang/test/OpenMP/target_teams_depend_codegen.cpp b/clang/test/OpenMP/target_teams_depend_codegen.cpp index 844dc83857837..56a408e49d447 100644 --- a/clang/test/OpenMP/target_teams_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_teams_distribute_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_codegen.cpp index ce489d3b4e2a3..1ad3d8333ba60 100644 --- a/clang/test/OpenMP/target_teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_codegen.cpp @@ -792,64 +792,64 @@ int bar(int n){ // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !21 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !21 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META21:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: 
store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META21]] // CHECK1-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !21 -// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !21 -// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !21 -// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !21 +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META21]] +// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 // CHECK1-NEXT: [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0 // CHECK1-NEXT: [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0 -// 
CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !21 +// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK1-NEXT: store i32 3, ptr [[TMP22]], align 4, !noalias !21 +// CHECK1-NEXT: store i32 3, ptr [[TMP22]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP23]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP23]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP24]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP24]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP25]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP25]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP26]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP26]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK1-NEXT: store ptr null, ptr [[TMP27]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr null, ptr [[TMP27]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK1-NEXT: store ptr 
null, ptr [[TMP28]], align 8, !noalias !21 +// CHECK1-NEXT: store ptr null, ptr [[TMP28]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK1-NEXT: store i64 10, ptr [[TMP29]], align 8, !noalias !21 +// CHECK1-NEXT: store i64 10, ptr [[TMP29]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK1-NEXT: store i64 1, ptr [[TMP30]], align 8, !noalias !21 +// CHECK1-NEXT: store i64 1, ptr [[TMP30]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK1-NEXT: store [3 x i32] [[TMP20]], ptr [[TMP31]], align 4, !noalias !21 +// CHECK1-NEXT: store [3 x i32] [[TMP20]], ptr [[TMP31]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK1-NEXT: store [3 x i32] [[TMP21]], ptr [[TMP32]], align 4, !noalias !21 +// CHECK1-NEXT: store [3 x i32] [[TMP21]], ptr [[TMP32]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK1-NEXT: store i32 0, ptr [[TMP33]], align 4, !noalias !21 +// CHECK1-NEXT: store i32 0, ptr [[TMP33]], align 4, !noalias [[META21]] // CHECK1-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 [[TMP18]], i32 [[TMP19]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK1-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 // CHECK1-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK1: omp_offload.failed.i: // CHECK1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP12]], align 2 -// 
CHECK1-NEXT: store i16 [[TMP36]], ptr [[AA_CASTED_I]], align 2, !noalias !21 -// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias !21 +// CHECK1-NEXT: store i16 [[TMP36]], ptr [[AA_CASTED_I]], align 2, !noalias [[META21]] +// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK1-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !21 -// CHECK1-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias !21 +// CHECK1-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META21]] +// CHECK1-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META21]] // CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK1-NEXT: store i32 [[TMP40]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !21 -// CHECK1-NEXT: [[TMP41:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 8, !noalias !21 +// CHECK1-NEXT: store i32 [[TMP40]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META21]] +// CHECK1-NEXT: [[TMP41:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 8, !noalias [[META21]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103(i64 [[TMP37]], i64 [[TMP39]], i64 [[TMP41]]) #[[ATTR3]] // CHECK1-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK1: .omp_outlined..exit: @@ -2032,13 +2032,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3fooi // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -2537,64 +2530,64 @@ int bar(int n){ // CHECK3-NEXT: call void 
@llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META22:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META22]] // CHECK3-NEXT: call void [[TMP10]](ptr 
[[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !22 -// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !22 -// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !22 -// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias !22 +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 // CHECK3-NEXT: [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0 // CHECK3-NEXT: [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0 -// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK3-NEXT: store i32 3, ptr [[TMP22]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 3, ptr [[TMP22]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], 
ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP23]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP23]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP24]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP24]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP25]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP25]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP26]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP26]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK3-NEXT: store ptr null, ptr [[TMP27]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr null, ptr [[TMP27]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK3-NEXT: store ptr null, ptr [[TMP28]], align 4, !noalias !22 +// CHECK3-NEXT: store ptr null, ptr [[TMP28]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK3-NEXT: store i64 10, ptr [[TMP29]], align 8, !noalias !22 +// CHECK3-NEXT: store i64 10, ptr [[TMP29]], align 8, !noalias [[META22]] // CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 
9 -// CHECK3-NEXT: store i64 1, ptr [[TMP30]], align 8, !noalias !22 +// CHECK3-NEXT: store i64 1, ptr [[TMP30]], align 8, !noalias [[META22]] // CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK3-NEXT: store [3 x i32] [[TMP20]], ptr [[TMP31]], align 4, !noalias !22 +// CHECK3-NEXT: store [3 x i32] [[TMP20]], ptr [[TMP31]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK3-NEXT: store [3 x i32] [[TMP21]], ptr [[TMP32]], align 4, !noalias !22 +// CHECK3-NEXT: store [3 x i32] [[TMP21]], ptr [[TMP32]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK3-NEXT: store i32 0, ptr [[TMP33]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 0, ptr [[TMP33]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 [[TMP18]], i32 [[TMP19]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK3-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 // CHECK3-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK3: omp_offload.failed.i: // CHECK3-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP12]], align 2 -// CHECK3-NEXT: store i16 [[TMP36]], ptr [[AA_CASTED_I]], align 2, !noalias !22 -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias !22 +// CHECK3-NEXT: store i16 [[TMP36]], ptr [[AA_CASTED_I]], align 2, !noalias [[META22]] +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK3-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !22 -// CHECK3-NEXT: 
[[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META22]] // CHECK3-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK3-NEXT: store i32 [[TMP40]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !22 -// CHECK3-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !22 +// CHECK3-NEXT: store i32 [[TMP40]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META22]] +// CHECK3-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META22]] // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103(i32 [[TMP37]], i32 [[TMP39]], i32 [[TMP41]]) #[[ATTR3]] // CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK3: .omp_outlined..exit: @@ -3777,13 +3770,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[AA:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_codegen_registration.cpp b/clang/test/OpenMP/target_teams_distribute_codegen_registration.cpp index e888a1c088d90..d619927d5bb80 100644 --- a/clang/test/OpenMP/target_teams_distribute_codegen_registration.cpp +++ b/clang/test/OpenMP/target_teams_distribute_codegen_registration.cpp @@ -210,10 +210,10 @@ // TCHECK-DAG: [[ENTRY12:@.+]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr 
[[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. -// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ diff --git a/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp index 56579b33c1779..2d886109b95b9 100644 --- a/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp @@ -255,13 +255,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -421,13 +414,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // 
CHECK9-NEXT: entry: @@ -890,13 +876,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1353,10 +1332,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_depend_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_depend_codegen.cpp index 108a6811dbc1c..b496b14394d4e 100644 --- a/clang/test/OpenMP/target_teams_distribute_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp index 5d940d020deaa..714ed708e71a8 100644 --- a/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp @@ -534,13 +534,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -950,13 +943,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1998,13 +1984,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -3041,10 +3020,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) 
// CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp index 540d28cf9bc95..1d6f69079df96 100644 --- a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp @@ -528,7 +528,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -941,13 +941,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1335,7 +1328,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1746,13 +1739,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK9-SAME: () #[[ATTR0:[0-9]+]] 
{ // CHECK9-NEXT: entry: @@ -1877,7 +1863,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[G1_ADDR:%.*]] = alloca i64, align 8 @@ -1905,7 +1891,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1993,10 +1979,3 @@ int main() { // CHECK9-NEXT: call void @__cxx_global_var_init.2() // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp index 221ebf3767741..ffba1c2221c24 100644 --- a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp @@ -294,13 +294,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () 
#[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -457,13 +450,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -795,7 +781,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1165,13 +1151,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1501,7 +1480,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1868,10 +1847,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// 
CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp index d55a7348b5840..9e12880be298d 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp @@ -1207,13 +1207,6 @@ int target_teams_fun(int *g){ // CHECK2-NEXT: ret void // // -// CHECK2-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK2-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK2-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_Z16target_teams_funPi // CHECK4-SAME: (ptr noundef [[G:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -1846,13 +1839,6 @@ int target_teams_fun(int *g){ // CHECK4-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK4-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK4-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51 // CHECK10-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK10-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp index 46f612c0db28c..d13920ba956f1 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp @@ -335,13 +335,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -572,13 +565,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1242,13 +1228,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1906,10 +1885,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp index f6e201e2fc802..41ad165b40df2 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp +++ 
b/clang/test/OpenMP/target_teams_distribute_parallel_for_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. -// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp index afd15f4c30dc3..517ea936a1d68 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp @@ -768,13 +768,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1394,13 +1387,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -3021,13 +3007,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void 
-// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -4619,10 +4598,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp index 55ceeafea83fe..9f11929ec372d 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp @@ -719,7 +719,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1262,13 +1262,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1782,7 +1775,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2319,13 +2312,6 
@@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2450,7 +2436,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99 -// CHECK5-SAME: (i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK5-SAME: (i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK5-NEXT: [[G1_ADDR:%.*]] = alloca i64, align 8 @@ -2478,7 +2464,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2559,7 +2545,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK5-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2658,13 +2644,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l122 // CHECK13-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp index b64e624c09372..fe2088aace15e 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp @@ -1567,10 +1567,3 @@ int main() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp index 
6142e9113660e..e2181e5088cc9 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp @@ -447,13 +447,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -718,13 +711,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1210,7 +1196,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK5-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK5-SAME: () #[[ATTR1]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1725,13 +1711,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@main // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -2211,7 +2190,7 @@ int main() { // // // CHECK7-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK7-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK7-SAME: () #[[ATTR1]] comdat { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK7-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], 
align 4 @@ -2719,10 +2698,3 @@ int main() { // CHECK7-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK7-NEXT: ret void // -// -// CHECK7-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK7-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK7-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp index 98593d5c1224c..74dee0399f58e 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp @@ -198,10 +198,3 @@ void gtid_test() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] section ".text.startup" { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp index 2e45b84d6a37d..82f29fa1d3ef4 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp @@ -550,7 +550,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -913,13 +913,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// 
CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1284,7 +1277,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1641,13 +1634,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1772,14 +1758,14 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104 -// CHECK5-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR4:[0-9]+]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1846,7 +1832,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1944,13 +1930,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124 // CHECK13-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp index 3642c9ca4c62b..6d22bc22e4796 
100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp @@ -581,10 +581,3 @@ int main() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp index fa48c02a65647..bfa00ee7a0f4b 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp @@ -222,8 +222,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -321,8 +321,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR2]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -512,8 +512,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -611,8 +611,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR2]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label 
[[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -669,13 +669,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -806,8 +799,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -903,8 +896,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label 
[[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1092,8 +1085,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1189,8 +1182,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ 
-1247,13 +1240,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1341,8 +1327,8 @@ int main() { // CHECK5-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1444,8 +1430,8 @@ int main() { // CHECK5-NEXT: store ptr [[SIVAR2]], ptr [[TMP15]], align 8 // CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: 
[[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1501,10 +1487,3 @@ int main() { // CHECK5-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 // CHECK5-NEXT: ret void // -// -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp index 3cc57b74616f1..f71a5ca734377 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp @@ -1221,13 +1221,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -2228,13 +2221,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -3262,13 +3248,6 @@ int main (int argc, char **argv) { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@_Z21teams_template_structv 
// CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -4269,13 +4248,6 @@ int main (int argc, char **argv) { // CHECK7-NEXT: ret void // // -// CHECK7-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK7-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK7-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -6927,13 +6899,6 @@ int main (int argc, char **argv) { // CHECK13-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK13-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK13-NEXT: ret void -// -// // CHECK15-LABEL: define {{[^@]+}}@main // CHECK15-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK15-NEXT: entry: @@ -9537,13 +9502,6 @@ int main (int argc, char **argv) { // CHECK15-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK15-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK15-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@main // CHECK17-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -12195,13 +12153,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@main // CHECK19-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -14804,10 +14755,3 @@ int main (int argc, 
char **argv) { // CHECK19: omp.dispatch.end: // CHECK19-NEXT: ret void // -// -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp index 74742ce19ca8e..5294f0d65eb6d 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp @@ -1017,13 +1017,6 @@ void test_target_teams_atomic() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z16target_teams_funPi // CHECK3-SAME: (ptr noundef [[G:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1945,13 +1938,6 @@ void test_target_teams_atomic() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z16target_teams_funPi // CHECK5-SAME: (ptr noundef [[G:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp index 71ad4421450b8..f93ac400a2660 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen_registration.cpp @@ -210,10 +210,10 @@ // 
TCHECK-DAG: [[ENTRY12:@.+]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr [[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. -// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp index a1c61a055606f..2b4d31d145240 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp @@ -351,13 +351,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -604,13 +597,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void 
-// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1466,13 +1452,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -2183,13 +2162,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp index bea0ffce42d1c..f1e5c6842a852 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_depend_codegen.cpp @@ -51,10 +51,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp index ac1b7da71ef51..90fa290370b45 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp @@ -810,13 +810,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1478,13 +1471,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -3466,13 +3452,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -5179,13 +5158,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// 
CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp index a7f257773b9c5..bf79fe669d83a 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp @@ -731,7 +731,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1288,13 +1288,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1822,7 +1815,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2373,13 +2366,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// 
CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2504,7 +2490,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99 -// CHECK5-SAME: (i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK5-SAME: (i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK5-NEXT: [[G1_ADDR:%.*]] = alloca i64, align 8 @@ -2532,7 +2518,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2620,7 +2606,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef 
[[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2726,13 +2712,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -2860,7 +2839,7 @@ int main() { // // // CHECK7-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK7-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK7-SAME: () #[[ATTR1]] comdat { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK7-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -3181,7 +3160,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp index 8fd51fb17ff5d..329cd788c8bab 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp @@ -1712,13 +1712,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK3-SAME: () 
#[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1871,7 +1864,7 @@ int main() { // CHECK3-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK3-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARG_ADDR]], align 4, !nontemporal !10, !llvm.access.group [[ACC_GRP9]] +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARG_ADDR]], align 4, !nontemporal [[META10:![0-9]+]], !llvm.access.group [[ACC_GRP9]] // CHECK3-NEXT: store i32 [[TMP11]], ptr [[ARG_CASTED]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK3-NEXT: [[TMP12:%.*]] = load i64, ptr [[ARG_CASTED]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l45.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]]), !llvm.access.group [[ACC_GRP9]] @@ -1953,7 +1946,7 @@ int main() { // CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK3-NEXT: store i32 0, ptr [[ARG_ADDR]], align 4, !nontemporal !10, !llvm.access.group [[ACC_GRP14]] +// CHECK3-NEXT: store i32 0, ptr [[ARG_ADDR]], align 4, !nontemporal [[META10]], !llvm.access.group [[ACC_GRP14]] // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -3554,13 +3547,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ 
-3871,7 +3857,7 @@ int main() { // CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1 // CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK7-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] -// CHECK7-NEXT: store i32 0, ptr @Arg, align 4, !nontemporal !3, !llvm.access.group [[ACC_GRP2]] +// CHECK7-NEXT: store i32 0, ptr @Arg, align 4, !nontemporal [[META3:![0-9]+]], !llvm.access.group [[ACC_GRP2]] // CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK7: omp.body.continue: // CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -5759,13 +5745,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -5918,7 +5897,7 @@ int main() { // CHECK11-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 // CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK11-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 -// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARG_ADDR]], align 4, !nontemporal !10, !llvm.access.group [[ACC_GRP9]] +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARG_ADDR]], align 4, !nontemporal [[META10:![0-9]+]], !llvm.access.group [[ACC_GRP9]] // CHECK11-NEXT: store i32 [[TMP11]], ptr [[ARG_CASTED]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK11-NEXT: [[TMP12:%.*]] = load i64, ptr [[ARG_CASTED]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK11-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l45.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]]), !llvm.access.group [[ACC_GRP9]] @@ -6000,7 +5979,7 @@ int main() { // CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 // CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP14]] -// CHECK11-NEXT: store i32 0, ptr [[ARG_ADDR]], align 4, !nontemporal !10, !llvm.access.group [[ACC_GRP14]] +// CHECK11-NEXT: store i32 0, ptr [[ARG_ADDR]], align 4, !nontemporal [[META10]], !llvm.access.group [[ACC_GRP14]] // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK11: omp.body.continue: // CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -7601,13 +7580,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -7918,7 +7890,7 @@ int main() { // CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1 // CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK15-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] -// CHECK15-NEXT: store i32 0, ptr @Arg, align 4, !nontemporal !3, !llvm.access.group [[ACC_GRP2]] +// CHECK15-NEXT: store i32 0, ptr @Arg, align 4, !nontemporal [[META3:![0-9]+]], !llvm.access.group [[ACC_GRP2]] // CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK15: omp.body.continue: // CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp index 
65b57ceb39adf..903ecb865a250 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp @@ -461,13 +461,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -746,13 +739,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1252,7 +1238,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK5-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK5-SAME: () #[[ATTR1]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1781,13 +1767,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@main // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -2281,7 +2260,7 @@ int main() { // // // CHECK7-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK7-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK7-SAME: () #[[ATTR1]] comdat { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK7-NEXT: [[TEST:%.*]] = alloca 
[[STRUCT_S_0:%.*]], align 4 @@ -2804,13 +2783,6 @@ int main() { // CHECK7-NEXT: ret void // // -// CHECK7-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK7-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK7-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -2956,14 +2928,14 @@ int main() { // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i64 4, i1 false) // CHECK13-NEXT: [[TMP17:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK13-NEXT: store i32 [[TMP17]], ptr @_ZZ4mainE4svar, align 4 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN12]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_ARRAYCPY_DONE11]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done13: @@ 
-2975,11 +2947,11 @@ int main() { // CHECK13: arraydestroy.body16: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST17:%.*]] = phi ptr [ [[TMP19]], [[ARRAYDESTROY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT18:%.*]], [[ARRAYDESTROY_BODY16]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT18]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST17]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE19:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT18]], [[ARRAY_BEGIN15]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_DONE20:%.*]], label [[ARRAYDESTROY_BODY16]] // CHECK13: arraydestroy.done20: -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP20:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP20]] // @@ -3013,12 +2985,12 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR1]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -3118,14 +3090,14 @@ int main() { // 
CHECK13: omp.arraycpy.done11: // CHECK13-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP6]], align 8 // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i64 4, i1 false) -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN12]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE11]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done13: @@ -3136,11 +3108,11 @@ int main() { // CHECK13: arraydestroy.body15: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR4]] +// 
CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAY_BEGIN14]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15]] // CHECK13: arraydestroy.done19: -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP19]] // @@ -3208,7 +3180,7 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // @@ -3353,14 +3325,14 @@ int main() { // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i32 4, i1 false) // CHECK15-NEXT: [[TMP17:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK15-NEXT: store i32 [[TMP17]], ptr @_ZZ4mainE4svar, align 4 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4:[0-9]+]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR3:[0-9]+]] // CHECK15-NEXT: [[ARRAY_BEGIN11:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN11]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // 
CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_ARRAYCPY_DONE10]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN11]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE12:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done12: @@ -3372,11 +3344,11 @@ int main() { // CHECK15: arraydestroy.body15: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP19]], [[ARRAYDESTROY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAY_BEGIN14]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15]] // CHECK15: arraydestroy.done19: -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP20:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP20]] // @@ -3410,12 +3382,12 @@ int main() { // 
CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // // CHECK15-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK15-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK15-SAME: () #[[ATTR1]] comdat { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK15-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -3513,14 +3485,14 @@ int main() { // CHECK15: omp.arraycpy.done10: // CHECK15-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP6]], align 4 // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i32 4, i1 false) -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAY_BEGIN11:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN11]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE10]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) 
[[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN11]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE12:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done12: @@ -3531,11 +3503,11 @@ int main() { // CHECK15: arraydestroy.body14: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST15:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT16:%.*]], [[ARRAYDESTROY_BODY14]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT16]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST15]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT16]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT16]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE17:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT16]], [[ARRAY_BEGIN13]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE17]], label [[ARRAYDESTROY_DONE18:%.*]], label [[ARRAYDESTROY_BODY14]] // CHECK15: arraydestroy.done18: -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP19]] // @@ -3603,7 +3575,7 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // diff --git 
a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp index 48ead1c3e43ce..7a211b4cd06b2 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp @@ -941,13 +941,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1697,13 +1690,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2014,13 +2000,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp index 717ef52ea73f1..31941b48f92df 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp @@ -624,13 +624,6 @@ int 
main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp index 6ca9097ac9b6c..f255c3f084dbf 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp @@ -229,8 +229,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -335,8 +335,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR2]], ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 
[[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -533,8 +533,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -639,8 +639,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR2]], ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: 
.omp.reduction.case1: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -697,13 +697,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -841,8 +834,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -945,8 +938,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label 
[[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1141,8 +1134,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1245,8 +1238,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1303,13 +1296,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK3-NEXT: 
entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1404,8 +1390,8 @@ int main() { // CHECK5-NEXT: store ptr [[SIVAR1]], ptr [[TMP16]], align 8 // CHECK5-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1514,8 +1500,8 @@ int main() { // CHECK5-NEXT: store ptr [[SIVAR2]], ptr [[TMP17]], align 8 // CHECK5-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP18]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1572,13 +1558,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: 
() #[[ATTR6:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@main // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp index 4a2a71ca83891..b41571eb415d9 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp @@ -1291,13 +1291,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -2368,13 +2361,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -3472,13 +3458,6 @@ int main (int argc, char **argv) { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -4549,13 +4528,6 @@ int main (int argc, char **argv) { // 
CHECK7-NEXT: ret void // // -// CHECK7-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK7-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK7-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -7782,13 +7754,6 @@ int main (int argc, char **argv) { // CHECK13-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK13-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK13-NEXT: ret void -// -// // CHECK15-LABEL: define {{[^@]+}}@main // CHECK15-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK15-NEXT: entry: @@ -10582,13 +10547,6 @@ int main (int argc, char **argv) { // CHECK15-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK15-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK15-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@main // CHECK17-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -13430,13 +13388,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@main // CHECK19-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -16230,13 +16181,6 @@ int main (int argc, char **argv) { // CHECK19-NEXT: ret void // // -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR6:[0-9]+]] { -// 
CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// -// // CHECK21-LABEL: define {{[^@]+}}@main // CHECK21-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK21-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp index e83ba0ff9082f..5bc78062c8309 100644 --- a/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp @@ -388,7 +388,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -647,13 +647,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -921,7 +914,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1178,13 +1171,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK9-SAME: () #[[ATTR0:[0-9]+]] 
{ // CHECK9-NEXT: entry: @@ -1309,14 +1295,14 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: () #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined) // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1403,10 +1389,3 @@ int main() { // CHECK9-NEXT: call void @__cxx_global_var_init.2() // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp index 424985dea537c..9e9306a5e749f 100644 --- a/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp @@ -827,8 +827,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l209.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 
[[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -947,8 +947,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l214.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1069,8 +1069,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l220.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP14:%.*]] = load 
i32, ptr [[TMP0]], align 4 @@ -1212,8 +1212,8 @@ int main() { // CHECK1-NEXT: store ptr [[AND_VAR1]], ptr [[TMP13]], align 8 // CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l226.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -1379,8 +1379,8 @@ int main() { // CHECK1-NEXT: store ptr [[OR_VAR1]], ptr [[TMP13]], align 8 // CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l232.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -1537,8 +1537,8 @@ int main() { // CHECK1-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l238.omp_outlined.omp.reduction.reduction_func, ptr 
@.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1657,8 +1657,8 @@ int main() { // CHECK1-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l243.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1777,8 +1777,8 @@ int main() { // CHECK1-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l248.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] 
// CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1906,8 +1906,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l254.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -2053,8 +2053,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l260.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -2724,8 +2724,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -2853,8 +2853,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l37.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -2984,8 +2984,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l43.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: 
i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -3137,8 +3137,8 @@ int main() { // CHECK1-NEXT: store ptr [[AND_VAR1]], ptr [[TMP15]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -3314,8 +3314,8 @@ int main() { // CHECK1-NEXT: store ptr [[OR_VAR1]], ptr [[TMP15]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l55.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -3481,8 +3481,8 @@ int main() { // CHECK1-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 
@__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l61.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -3610,8 +3610,8 @@ int main() { // CHECK1-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -3739,8 +3739,8 @@ int main() { // CHECK1-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l71.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label 
[[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -3880,8 +3880,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP17]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l77.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP18]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -4039,8 +4039,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP17]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l83.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP18]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -4096,13 +4096,6 @@ int main() 
{ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -4662,8 +4655,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l209.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -4782,8 +4775,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l214.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -4904,8 +4897,8 
@@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l220.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -5047,8 +5040,8 @@ int main() { // CHECK3-NEXT: store ptr [[AND_VAR1]], ptr [[TMP13]], align 4 // CHECK3-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l226.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -5214,8 +5207,8 @@ int main() { // CHECK3-NEXT: store ptr [[OR_VAR1]], ptr [[TMP13]], align 4 // CHECK3-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l232.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 
[[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -5372,8 +5365,8 @@ int main() { // CHECK3-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l238.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -5492,8 +5485,8 @@ int main() { // CHECK3-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l243.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = 
load i32, ptr [[TMP0]], align 4 @@ -5612,8 +5605,8 @@ int main() { // CHECK3-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l248.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -5741,8 +5734,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l254.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -5888,8 +5881,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l260.omp_outlined.omp.reduction.reduction_func, ptr 
@.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -6559,8 +6552,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -6687,8 +6680,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l37.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // 
CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -6817,8 +6810,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l43.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -6969,8 +6962,8 @@ int main() { // CHECK3-NEXT: store ptr [[AND_VAR1]], ptr [[TMP15]], align 4 // CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -7145,8 +7138,8 @@ int main() { // CHECK3-NEXT: store ptr [[OR_VAR1]], ptr [[TMP15]], align 4 // CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l55.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -7311,8 +7304,8 @@ int main() { // CHECK3-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l61.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -7439,8 +7432,8 @@ int main() { // CHECK3-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// 
CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -7567,8 +7560,8 @@ int main() { // CHECK3-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l71.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -7706,8 +7699,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP17]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l77.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP18]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -7863,8 +7856,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP17]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = 
call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l83.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP18]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -7920,13 +7913,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -8040,8 +8026,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP13]], align 8 // CHECK9-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -8164,8 +8150,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP13]], align 8 // 
CHECK9-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l112.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -8290,8 +8276,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP13]], align 8 // CHECK9-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l122.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -8445,8 +8431,8 @@ int main() { // CHECK9-NEXT: store ptr [[AND_VAR1]], ptr [[TMP14]], align 8 // CHECK9-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l133.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, 
label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -8624,8 +8610,8 @@ int main() { // CHECK9-NEXT: store ptr [[OR_VAR1]], ptr [[TMP14]], align 8 // CHECK9-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l144.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP0]], align 1 @@ -8794,8 +8780,8 @@ int main() { // CHECK9-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP13]], align 8 // CHECK9-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l155.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -8926,8 +8912,8 @@ int main() { // 
CHECK9-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP13]], align 8 // CHECK9-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -9058,8 +9044,8 @@ int main() { // CHECK9-NEXT: store ptr [[BIT_VAR1]], ptr [[TMP13]], align 8 // CHECK9-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l177.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -9191,8 +9177,8 @@ int main() { // CHECK9-NEXT: store ptr [[MAX_VAR1]], ptr [[TMP15]], align 8 // CHECK9-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l188.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP16]], 
label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -9342,8 +9328,8 @@ int main() { // CHECK9-NEXT: store ptr [[MIN_VAR1]], ptr [[TMP15]], align 8 // CHECK9-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l199.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -9398,10 +9384,3 @@ int main() { // CHECK9-NEXT: store i32 [[COND]], ptr [[TMP7]], align 4 // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp index 67279d431fcb3..5091d41eec4f3 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp @@ -785,62 +785,62 @@ int bar(int n){ // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) // 
CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) -// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !26 -// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !26 -// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !26 -// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !26 -// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !26 -// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !26 -// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !26 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !26 -// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !26 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META26:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META26]] // CHECK1-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr 
[[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !26 -// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !26 -// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !26 -// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !26 +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META26]] +// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK1-NEXT: [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0 -// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !26 +// CHECK1-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]] // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK1-NEXT: store i32 3, ptr [[TMP20]], align 4, !noalias !26 +// CHECK1-NEXT: store i32 3, ptr [[TMP20]], align 4, !noalias [[META26]] // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP21]], align 8, !noalias !26 +// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP21]], align 8, !noalias [[META26]] // 
CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP22]], align 8, !noalias !26 +// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP22]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP23]], align 8, !noalias !26 +// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP23]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP24]], align 8, !noalias !26 +// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP24]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK1-NEXT: store ptr null, ptr [[TMP25]], align 8, !noalias !26 +// CHECK1-NEXT: store ptr null, ptr [[TMP25]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK1-NEXT: store ptr null, ptr [[TMP26]], align 8, !noalias !26 +// CHECK1-NEXT: store ptr null, ptr [[TMP26]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK1-NEXT: store i64 10, ptr [[TMP27]], align 8, !noalias !26 +// CHECK1-NEXT: store i64 10, ptr [[TMP27]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK1-NEXT: store i64 1, ptr [[TMP28]], align 8, !noalias !26 +// CHECK1-NEXT: store i64 1, ptr [[TMP28]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr 
inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK1-NEXT: store [3 x i32] [[TMP19]], ptr [[TMP29]], align 4, !noalias !26 +// CHECK1-NEXT: store [3 x i32] [[TMP19]], ptr [[TMP29]], align 4, !noalias [[META26]] // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP30]], align 4, !noalias !26 +// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP30]], align 4, !noalias [[META26]] // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK1-NEXT: store i32 0, ptr [[TMP31]], align 4, !noalias !26 +// CHECK1-NEXT: store i32 0, ptr [[TMP31]], align 4, !noalias [[META26]] // CHECK1-NEXT: [[TMP32:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 [[TMP18]], i32 1, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK1-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK1-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK1: omp_offload.failed.i: // CHECK1-NEXT: [[TMP34:%.*]] = load i16, ptr [[TMP12]], align 2 -// CHECK1-NEXT: store i16 [[TMP34]], ptr [[AA_CASTED_I]], align 2, !noalias !26 -// CHECK1-NEXT: [[TMP35:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias !26 +// CHECK1-NEXT: store i16 [[TMP34]], ptr [[AA_CASTED_I]], align 2, !noalias [[META26]] +// CHECK1-NEXT: [[TMP35:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK1-NEXT: store i32 [[TMP36]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !26 -// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias !26 +// CHECK1-NEXT: store i32 [[TMP36]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias 
[[META26]] +// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META26]] // CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK1-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !26 -// CHECK1-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 8, !noalias !26 +// CHECK1-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META26]] +// CHECK1-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 8, !noalias [[META26]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97(i64 [[TMP35]], i64 [[TMP37]], i64 [[TMP39]]) #[[ATTR3]] // CHECK1-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK1: .omp_outlined..exit: @@ -2057,13 +2057,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3fooi // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -2556,62 +2549,62 @@ int bar(int n){ // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) -// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !27 -// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !27 -// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !27 -// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !27 -// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !27 -// 
CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !27 -// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !27 -// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !27 -// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !27 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META27:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META27]] // CHECK3-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !27 -// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !27 -// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !27 -// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias !27 +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr 
[[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0 -// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !27 +// CHECK3-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK3-NEXT: store i32 3, ptr [[TMP20]], align 4, !noalias !27 +// CHECK3-NEXT: store i32 3, ptr [[TMP20]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP21]], align 4, !noalias !27 +// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP21]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP22]], align 4, !noalias !27 +// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP22]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP23]], align 4, !noalias !27 +// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP23]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP24]], align 4, !noalias !27 +// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP24]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK3-NEXT: store ptr null, ptr [[TMP25]], align 4, !noalias !27 +// CHECK3-NEXT: store ptr null, ptr [[TMP25]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK3-NEXT: store ptr null, ptr [[TMP26]], align 4, !noalias !27 +// CHECK3-NEXT: store ptr null, ptr [[TMP26]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK3-NEXT: store i64 10, ptr [[TMP27]], align 8, !noalias !27 +// CHECK3-NEXT: store i64 10, ptr [[TMP27]], align 8, !noalias [[META27]] // CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK3-NEXT: store i64 1, ptr [[TMP28]], align 8, !noalias !27 +// CHECK3-NEXT: store i64 1, ptr [[TMP28]], align 8, !noalias [[META27]] // CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK3-NEXT: store [3 x i32] [[TMP19]], ptr [[TMP29]], align 4, !noalias !27 +// CHECK3-NEXT: store [3 x i32] [[TMP19]], ptr [[TMP29]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK3-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP30]], align 4, !noalias !27 +// CHECK3-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP30]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK3-NEXT: store i32 0, ptr [[TMP31]], align 4, !noalias !27 +// CHECK3-NEXT: store i32 0, ptr [[TMP31]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP32:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 [[TMP18]], i32 1, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK3-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK3-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK3: omp_offload.failed.i: // CHECK3-NEXT: [[TMP34:%.*]] = load i16, ptr [[TMP12]], align 2 -// CHECK3-NEXT: store i16 [[TMP34]], ptr [[AA_CASTED_I]], align 2, !noalias !27 -// CHECK3-NEXT: [[TMP35:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias !27 +// CHECK3-NEXT: store i16 [[TMP34]], ptr [[AA_CASTED_I]], align 2, !noalias [[META27]] +// CHECK3-NEXT: [[TMP35:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK3-NEXT: store i32 [[TMP36]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !27 -// CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !27 +// CHECK3-NEXT: store i32 [[TMP36]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META27]] // CHECK3-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK3-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !27 -// CHECK3-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !27 +// CHECK3-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META27]] +// CHECK3-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META27]] // CHECK3-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97(i32 [[TMP35]], i32 [[TMP37]], i32 [[TMP39]]) #[[ATTR3]] // CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK3: .omp_outlined..exit: @@ -3828,13 +3821,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z3fooi // CHECK5-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -4327,62 +4313,62 @@ int bar(int n){ // CHECK5-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) // CHECK5-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]]) // CHECK5-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) -// CHECK5-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !26 -// CHECK5-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !26 -// CHECK5-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !26 -// CHECK5-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !26 -// CHECK5-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !26 -// CHECK5-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !26 -// CHECK5-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !26 -// CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !26 -// CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !26 +// CHECK5-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META26:![0-9]+]] +// CHECK5-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: store ptr [[TMP8]], ptr 
[[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META26]] // CHECK5-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK5-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !26 -// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !26 -// CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !26 -// CHECK5-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !26 +// CHECK5-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META26]] +// CHECK5-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK5-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK5-NEXT: [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, 
i32 [[TMP18]], 0 -// CHECK5-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !26 +// CHECK5-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]] // CHECK5-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK5-NEXT: store i32 3, ptr [[TMP20]], align 4, !noalias !26 +// CHECK5-NEXT: store i32 3, ptr [[TMP20]], align 4, !noalias [[META26]] // CHECK5-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK5-NEXT: store ptr [[TMP13]], ptr [[TMP21]], align 8, !noalias !26 +// CHECK5-NEXT: store ptr [[TMP13]], ptr [[TMP21]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK5-NEXT: store ptr [[TMP14]], ptr [[TMP22]], align 8, !noalias !26 +// CHECK5-NEXT: store ptr [[TMP14]], ptr [[TMP22]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK5-NEXT: store ptr [[TMP15]], ptr [[TMP23]], align 8, !noalias !26 +// CHECK5-NEXT: store ptr [[TMP15]], ptr [[TMP23]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK5-NEXT: store ptr @.offload_maptypes, ptr [[TMP24]], align 8, !noalias !26 +// CHECK5-NEXT: store ptr @.offload_maptypes, ptr [[TMP24]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK5-NEXT: store ptr null, ptr [[TMP25]], align 8, !noalias !26 +// CHECK5-NEXT: store ptr null, ptr [[TMP25]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// 
CHECK5-NEXT: store ptr null, ptr [[TMP26]], align 8, !noalias !26 +// CHECK5-NEXT: store ptr null, ptr [[TMP26]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK5-NEXT: store i64 10, ptr [[TMP27]], align 8, !noalias !26 +// CHECK5-NEXT: store i64 10, ptr [[TMP27]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK5-NEXT: store i64 1, ptr [[TMP28]], align 8, !noalias !26 +// CHECK5-NEXT: store i64 1, ptr [[TMP28]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK5-NEXT: store [3 x i32] [[TMP19]], ptr [[TMP29]], align 4, !noalias !26 +// CHECK5-NEXT: store [3 x i32] [[TMP19]], ptr [[TMP29]], align 4, !noalias [[META26]] // CHECK5-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK5-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP30]], align 4, !noalias !26 +// CHECK5-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP30]], align 4, !noalias [[META26]] // CHECK5-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK5-NEXT: store i32 0, ptr [[TMP31]], align 4, !noalias !26 +// CHECK5-NEXT: store i32 0, ptr [[TMP31]], align 4, !noalias [[META26]] // CHECK5-NEXT: [[TMP32:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 [[TMP18]], i32 1, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK5-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK5-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK5: omp_offload.failed.i: // CHECK5-NEXT: [[TMP34:%.*]] = load 
i16, ptr [[TMP12]], align 2 -// CHECK5-NEXT: store i16 [[TMP34]], ptr [[AA_CASTED_I]], align 2, !noalias !26 -// CHECK5-NEXT: [[TMP35:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias !26 +// CHECK5-NEXT: store i16 [[TMP34]], ptr [[AA_CASTED_I]], align 2, !noalias [[META26]] +// CHECK5-NEXT: [[TMP35:%.*]] = load i64, ptr [[AA_CASTED_I]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK5-NEXT: store i32 [[TMP36]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !26 -// CHECK5-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias !26 +// CHECK5-NEXT: store i32 [[TMP36]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META26]] +// CHECK5-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META26]] // CHECK5-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK5-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !26 -// CHECK5-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 8, !noalias !26 +// CHECK5-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META26]] +// CHECK5-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 8, !noalias [[META26]] // CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97(i64 [[TMP35]], i64 [[TMP37]], i64 [[TMP39]]) #[[ATTR3]] // CHECK5-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK5: .omp_outlined..exit: @@ -4448,10 +4434,10 @@ int bar(int n){ // CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK5-NEXT: store i32 [[ADD]], ptr [[A1]], align 4, !nontemporal !27 -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[A1]], align 4, !nontemporal !27 +// CHECK5-NEXT: store i32 [[ADD]], ptr [[A1]], align 4, !nontemporal [[META27:![0-9]+]] +// 
CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[A1]], align 4, !nontemporal [[META27]] // CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK5-NEXT: store i32 [[ADD3]], ptr [[A1]], align 4, !nontemporal !27 +// CHECK5-NEXT: store i32 [[ADD3]], ptr [[A1]], align 4, !nontemporal [[META27]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: // CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -5668,13 +5654,6 @@ int bar(int n){ // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR4]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@_Z3fooi // CHECK7-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -6167,62 +6146,62 @@ int bar(int n){ // CHECK7-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) // CHECK7-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]]) // CHECK7-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) -// CHECK7-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias !27 +// CHECK7-NEXT: store i32 
[[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META27:![0-9]+]] +// CHECK7-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META27]] // CHECK7-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]]) #[[ATTR3]] -// CHECK7-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !27 -// CHECK7-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !27 -// CHECK7-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !27 -// CHECK7-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias !27 +// CHECK7-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1 // CHECK7-NEXT: 
[[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK7-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK7-NEXT: [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0 -// CHECK7-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias !27 +// CHECK7-NEXT: store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 -// CHECK7-NEXT: store i32 3, ptr [[TMP20]], align 4, !noalias !27 +// CHECK7-NEXT: store i32 3, ptr [[TMP20]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 -// CHECK7-NEXT: store ptr [[TMP13]], ptr [[TMP21]], align 4, !noalias !27 +// CHECK7-NEXT: store ptr [[TMP13]], ptr [[TMP21]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 -// CHECK7-NEXT: store ptr [[TMP14]], ptr [[TMP22]], align 4, !noalias !27 +// CHECK7-NEXT: store ptr [[TMP14]], ptr [[TMP22]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 -// CHECK7-NEXT: store ptr [[TMP15]], ptr [[TMP23]], align 4, !noalias !27 +// CHECK7-NEXT: store ptr [[TMP15]], ptr [[TMP23]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 -// CHECK7-NEXT: store ptr @.offload_maptypes, ptr [[TMP24]], align 4, !noalias !27 +// CHECK7-NEXT: store ptr @.offload_maptypes, ptr [[TMP24]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 -// CHECK7-NEXT: store ptr null, ptr [[TMP25]], align 4, 
!noalias !27 +// CHECK7-NEXT: store ptr null, ptr [[TMP25]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 -// CHECK7-NEXT: store ptr null, ptr [[TMP26]], align 4, !noalias !27 +// CHECK7-NEXT: store ptr null, ptr [[TMP26]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 -// CHECK7-NEXT: store i64 10, ptr [[TMP27]], align 8, !noalias !27 +// CHECK7-NEXT: store i64 10, ptr [[TMP27]], align 8, !noalias [[META27]] // CHECK7-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 -// CHECK7-NEXT: store i64 1, ptr [[TMP28]], align 8, !noalias !27 +// CHECK7-NEXT: store i64 1, ptr [[TMP28]], align 8, !noalias [[META27]] // CHECK7-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 -// CHECK7-NEXT: store [3 x i32] [[TMP19]], ptr [[TMP29]], align 4, !noalias !27 +// CHECK7-NEXT: store [3 x i32] [[TMP19]], ptr [[TMP29]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 -// CHECK7-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP30]], align 4, !noalias !27 +// CHECK7-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP30]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 -// CHECK7-NEXT: store i32 0, ptr [[TMP31]], align 4, !noalias !27 +// CHECK7-NEXT: store i32 0, ptr [[TMP31]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP32:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 [[TMP18]], i32 1, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, ptr [[KERNEL_ARGS_I]]) // CHECK7-NEXT: 
[[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK7-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] // CHECK7: omp_offload.failed.i: // CHECK7-NEXT: [[TMP34:%.*]] = load i16, ptr [[TMP12]], align 2 -// CHECK7-NEXT: store i16 [[TMP34]], ptr [[AA_CASTED_I]], align 2, !noalias !27 -// CHECK7-NEXT: [[TMP35:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias !27 +// CHECK7-NEXT: store i16 [[TMP34]], ptr [[AA_CASTED_I]], align 2, !noalias [[META27]] +// CHECK7-NEXT: [[TMP35:%.*]] = load i32, ptr [[AA_CASTED_I]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP16]], align 4 -// CHECK7-NEXT: store i32 [[TMP36]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !27 -// CHECK7-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias !27 +// CHECK7-NEXT: store i32 [[TMP36]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META27]] // CHECK7-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP17]], align 4 -// CHECK7-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !27 -// CHECK7-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias !27 +// CHECK7-NEXT: store i32 [[TMP38]], ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META27]] +// CHECK7-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED4_I]], align 4, !noalias [[META27]] // CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97(i32 [[TMP35]], i32 [[TMP37]], i32 [[TMP39]]) #[[ATTR3]] // CHECK7-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK7: .omp_outlined..exit: @@ -6288,10 +6267,10 @@ int bar(int n){ // CHECK7-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK7-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1 // CHECK7-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] -// CHECK7-NEXT: store i32 
[[ADD]], ptr [[A1]], align 4, !nontemporal !28 -// CHECK7-NEXT: [[TMP8:%.*]] = load i32, ptr [[A1]], align 4, !nontemporal !28 +// CHECK7-NEXT: store i32 [[ADD]], ptr [[A1]], align 4, !nontemporal [[META28:![0-9]+]] +// CHECK7-NEXT: [[TMP8:%.*]] = load i32, ptr [[A1]], align 4, !nontemporal [[META28]] // CHECK7-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP8]], 1 -// CHECK7-NEXT: store i32 [[ADD3]], ptr [[A1]], align 4, !nontemporal !28 +// CHECK7-NEXT: store i32 [[ADD3]], ptr [[A1]], align 4, !nontemporal [[META28]] // CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK7: omp.body.continue: // CHECK7-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] @@ -7508,13 +7487,6 @@ int bar(int n){ // CHECK7-NEXT: ret void // // -// CHECK7-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK7-SAME: () #[[ATTR4]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK7-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z3fooi // CHECK9-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -8634,10 +8606,10 @@ int bar(int n){ // CHECK13-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV7]], align 4 // CHECK13-NEXT: [[MUL13:%.*]] = mul nsw i32 [[TMP17]], 1 // CHECK13-NEXT: [[ADD14:%.*]] = add nsw i32 0, [[MUL13]] -// CHECK13-NEXT: store i32 [[ADD14]], ptr [[A8]], align 4, !nontemporal !7 -// CHECK13-NEXT: [[TMP18:%.*]] = load i32, ptr [[A8]], align 4, !nontemporal !7 +// CHECK13-NEXT: store i32 [[ADD14]], ptr [[A8]], align 4, !nontemporal [[META7:![0-9]+]] +// CHECK13-NEXT: [[TMP18:%.*]] = load i32, ptr [[A8]], align 4, !nontemporal [[META7]] // CHECK13-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP18]], 1 -// CHECK13-NEXT: store i32 [[ADD15]], ptr [[A8]], align 4, !nontemporal !7 +// CHECK13-NEXT: store i32 [[ADD15]], ptr [[A8]], align 4, !nontemporal [[META7]] // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE16:%.*]] // CHECK13: omp.body.continue16: // CHECK13-NEXT: br label [[OMP_INNER_FOR_INC17:%.*]] @@ -9190,10 
+9162,10 @@ int bar(int n){ // CHECK15-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV7]], align 4 // CHECK15-NEXT: [[MUL13:%.*]] = mul nsw i32 [[TMP15]], 1 // CHECK15-NEXT: [[ADD14:%.*]] = add nsw i32 0, [[MUL13]] -// CHECK15-NEXT: store i32 [[ADD14]], ptr [[A8]], align 4, !nontemporal !8 -// CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[A8]], align 4, !nontemporal !8 +// CHECK15-NEXT: store i32 [[ADD14]], ptr [[A8]], align 4, !nontemporal [[META8:![0-9]+]] +// CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[A8]], align 4, !nontemporal [[META8]] // CHECK15-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP16]], 1 -// CHECK15-NEXT: store i32 [[ADD15]], ptr [[A8]], align 4, !nontemporal !8 +// CHECK15-NEXT: store i32 [[ADD15]], ptr [[A8]], align 4, !nontemporal [[META8]] // CHECK15-NEXT: br label [[OMP_BODY_CONTINUE16:%.*]] // CHECK15: omp.body.continue16: // CHECK15-NEXT: br label [[OMP_INNER_FOR_INC17:%.*]] diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen_registration.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen_registration.cpp index 2b198c30eb126..9814448d5a6b0 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_codegen_registration.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen_registration.cpp @@ -210,10 +210,10 @@ // TCHECK-DAG: [[ENTRY12:@.+]] = weak{{.*}} constant [[ENTTY]] { ptr @{{.*}}, ptr [[NAMEPTR12]], i[[SZ]] 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1 // We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function. 
-// CHECK: @llvm.global_ctors = appending global [4 x { i32, ptr, ptr }] [ +// CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ // CHECK-SAME: { i32, ptr, ptr } { i32 500, ptr [[P500:@[^,]+]], ptr null }, // CHECK-SAME: { i32, ptr, ptr } { i32 501, ptr [[P501:@[^,]+]], ptr null }, -// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null }, +// CHECK-SAME: { i32, ptr, ptr } { i32 65535, ptr [[PMAX:@[^,]+]], ptr null } // CHECK-NTARGET: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [ diff --git a/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp index 3a303fe951cf4..ac7fc7c1acf91 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp @@ -263,13 +263,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -437,13 +430,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1072,13 +1058,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void 
@__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1562,13 +1541,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp index ffe85c1624b56..fb7eed632ba10 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_depend_codegen.cpp @@ -65,10 +65,6 @@ // TCHECK: @{{.+}} = {{.*}}constant [[ENTTY]] // TCHECK-NOT: @{{.+}} = weak constant [[ENTTY]] -// Check target registration is registered as a Ctor. 
-// CHECK: appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @.omp_offloading.requires_reg, ptr null }] - - template struct TT{ tx X; diff --git a/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp index 7b854870b9429..867ec89421402 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp @@ -555,13 +555,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -992,13 +985,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2344,13 +2330,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -3445,13 +3424,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg 
-// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp index f9bfa069a6e81..425bcaf6c3f3d 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp @@ -535,7 +535,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -955,13 +955,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1356,7 +1349,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1774,13 +1767,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // 
CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1908,7 +1894,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK5-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK5-SAME: () #[[ATTR1]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2228,7 +2214,7 @@ int main() { // // // CHECK7-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK7-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK7-SAME: () #[[ATTR1]] comdat { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK7-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2545,7 +2531,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74 -// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[G1_ADDR:%.*]] = alloca i64, align 8 @@ -2573,7 +2559,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l74.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2669,13 +2655,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp index 9704ba49fb84f..4c9e83afb3bb3 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp @@ -301,13 +301,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -471,13 +464,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -852,7 +838,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1229,13 +1215,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void 
@__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1572,7 +1551,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1947,13 +1926,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -2062,14 +2034,14 @@ int main() { // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP4]], ptr align 4 [[TMP15]], i64 4, i1 false) // CHECK13-NEXT: [[TMP16:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK13-NEXT: store i32 [[TMP16]], ptr @_ZZ4mainE4svar, align 4 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN12]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE11]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: 
call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done13: @@ -2081,11 +2053,11 @@ int main() { // CHECK13: arraydestroy.body16: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST17:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT18:%.*]], [[ARRAYDESTROY_BODY16]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT18]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST17]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE19:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT18]], [[ARRAY_BEGIN15]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_DONE20:%.*]], label [[ARRAYDESTROY_BODY16]] // CHECK13: arraydestroy.done20: -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP19]] // @@ -2119,12 +2091,12 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) 
#[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR1]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2223,14 +2195,14 @@ int main() { // CHECK13: omp.arraycpy.done11: // CHECK13-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP6]], align 8 // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP4]], ptr align 4 [[TMP15]], i64 4, i1 false) -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK13-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN12]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP16]], [[OMP_ARRAYCPY_DONE11]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done13: @@ -2241,11 +2213,11 
@@ int main() { // CHECK13: arraydestroy.body15: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP17]], [[ARRAYDESTROY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAY_BEGIN14]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15]] // CHECK13: arraydestroy.done19: -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP18:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP18]] // @@ -2313,7 +2285,7 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // @@ -2457,14 +2429,14 @@ int main() { // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP4]], ptr align 4 [[TMP15]], i32 4, i1 false) // CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK15-NEXT: store i32 [[TMP16]], ptr @_ZZ4mainE4svar, align 4 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 
4 dereferenceable(4) [[VAR5]]) #[[ATTR4:[0-9]+]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR3:[0-9]+]] // CHECK15-NEXT: [[ARRAY_BEGIN11:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN11]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE10]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN11]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE12:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done12: @@ -2476,11 +2448,11 @@ int main() { // CHECK15: arraydestroy.body15: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAY_BEGIN14]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE18]], 
label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15]] // CHECK15: arraydestroy.done19: -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP19]] // @@ -2514,12 +2486,12 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // // CHECK15-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK15-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK15-SAME: () #[[ATTR1]] comdat { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK15-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2616,14 +2588,14 @@ int main() { // CHECK15: omp.arraycpy.done10: // CHECK15-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP6]], align 4 // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP4]], ptr align 4 [[TMP15]], i32 4, i1 false) -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAY_BEGIN11:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK15-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN11]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: 
[[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP16]], [[OMP_ARRAYCPY_DONE10]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN11]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE12:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done12: @@ -2634,11 +2606,11 @@ int main() { // CHECK15: arraydestroy.body14: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST15:%.*]] = phi ptr [ [[TMP17]], [[ARRAYDESTROY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT16:%.*]], [[ARRAYDESTROY_BODY14]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT16]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST15]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT16]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT16]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE17:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT16]], [[ARRAY_BEGIN13]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE17]], label [[ARRAYDESTROY_DONE18:%.*]], label [[ARRAYDESTROY_BODY14]] // CHECK15: arraydestroy.done18: -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP18:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP18]] // @@ -2706,7 +2678,7 @@ int main() { // 
CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // diff --git a/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp index bf38e4bace5aa..6cfec0010d00c 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp @@ -661,13 +661,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1206,13 +1199,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2191,13 +2177,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git 
a/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp index 63ad2c3bdeb8c..1481b4fe85fde 100644 --- a/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp @@ -221,8 +221,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l60.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -404,8 +404,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -443,13 +443,6 @@ int main() { // 
CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -593,8 +586,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l60.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -776,8 +769,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -815,13 +808,6 @@ 
int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1122,8 +1108,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP15]], align 8 // CHECK9-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1161,13 +1147,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp index ad84510e7f8ab..26c9e4713f3e6 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp @@ -1174,13 +1174,6 @@ int target_teams_fun(int *g){ // CHECK2-NEXT: ret void // // -// CHECK2-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK2-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK2-NEXT: ret void -// -// // CHECK4-LABEL: define {{[^@]+}}@_Z16target_teams_funPi // CHECK4-SAME: (ptr noundef [[G:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK4-NEXT: entry: @@ -1796,13 +1789,6 @@ int target_teams_fun(int *g){ // CHECK4-NEXT: ret void // // -// CHECK4-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK4-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK4-NEXT: entry: -// CHECK4-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK4-NEXT: ret void -// -// // CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51 // CHECK10-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK10-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp index 0a6ae1ad405c2..6a30ef7f6eb8f 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp @@ -335,13 +335,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -572,13 +565,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: 
call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1242,13 +1228,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1906,10 +1885,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_generic_loop_depend_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_depend_codegen.cpp index b17b252baaeea..c44699ee12fcf 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_depend_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_depend_codegen.cpp @@ -3125,13 +3125,6 @@ int foo(int n) { // // // -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// -// // // // diff --git a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp index 4291a405e4baf..c19323c35b4e1 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp @@ 
-1487,10 +1487,3 @@ int main() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp index fb93d58b6bd17..195989692dc39 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp @@ -198,10 +198,3 @@ void gtid_test() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] section ".text.startup" { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp index 7cc148f4c4ee7..987c12adc6f66 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp @@ -550,7 +550,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -913,13 +913,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // 
CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1284,7 +1277,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1641,13 +1634,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1772,14 +1758,14 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104 -// CHECK5-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR4:[0-9]+]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1846,7 +1832,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1944,13 +1930,6 @@ int main() { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR0]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124 // CHECK13-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp index fa48c02a65647..bfa00ee7a0f4b 100644 --- 
a/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp @@ -222,8 +222,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -321,8 +321,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR2]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -512,8 +512,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i64 8, ptr 
[[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -611,8 +611,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR2]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -669,13 +669,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -806,8 +799,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 
@__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -903,8 +896,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1092,8 +1085,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 
1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1189,8 +1182,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1247,13 +1240,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1341,8 +1327,8 @@ int main() { // CHECK5-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP15]], label 
[[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1444,8 +1430,8 @@ int main() { // CHECK5-NEXT: store ptr [[SIVAR2]], ptr [[TMP15]], align 8 // CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1501,10 +1487,3 @@ int main() { // CHECK5-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 // CHECK5-NEXT: ret void // -// -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp index 64f0dced135f8..f945dc9b21d4e 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp @@ -473,10 +473,3 @@ void foo() { // CHECK-NEXT: call void 
@__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]]) // CHECK-NEXT: ret void // -// -// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK-NEXT: ret void -// diff --git a/clang/test/OpenMP/target_teams_map_codegen.cpp b/clang/test/OpenMP/target_teams_map_codegen.cpp index c535c495b2051..9ccf74514691b 100644 --- a/clang/test/OpenMP/target_teams_map_codegen.cpp +++ b/clang/test/OpenMP/target_teams_map_codegen.cpp @@ -346,8 +346,8 @@ void mapInt128() { // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 2, i64 16, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16mapWithReductionv_l39.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -804,8 +804,8 @@ void mapInt128() { // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8mapArrayv_l63.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, 
label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP1]], i64 99 @@ -924,8 +924,8 @@ void mapInt128() { // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8mapArrayv_l65.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP1]], i64 99 @@ -1161,8 +1161,8 @@ void mapInt128() { // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9mapInt128v_l72.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP7:%.*]] = load i128, ptr [[TMP1]], align 16 @@ -1253,8 +1253,8 @@ void mapInt128() { // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK1-NEXT: 
[[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9mapInt128v_l74.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP7:%.*]] = load i128, ptr [[TMP1]], align 16 @@ -1303,13 +1303,6 @@ void mapInt128() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z14mapWithPrivatev // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1580,8 +1573,8 @@ void mapInt128() { // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 2, i32 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16mapWithReductionv_l39.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -2038,8 +2031,8 @@ void mapInt128() { // 
CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8mapArrayv_l63.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP1]], i32 99 @@ -2158,8 +2151,8 @@ void mapInt128() { // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8mapArrayv_l65.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP1]], i32 99 @@ -2230,13 +2223,6 @@ void mapInt128() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z14mapWithPrivatev_l27 // CHECK5-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2336,8 +2322,8 @@ void mapInt128() { // CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 2, i64 16, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16mapWithReductionv_l39.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -2523,8 +2509,8 @@ void mapInt128() { // CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8mapArrayv_l63.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP1]], i64 99 @@ -2645,8 +2631,8 @@ void mapInt128() { // CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK5-NEXT: [[TMP6:%.*]] = call i32 
@__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8mapArrayv_l65.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP1]], i64 99 @@ -2761,8 +2747,8 @@ void mapInt128() { // CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9mapInt128v_l72.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP7:%.*]] = load i128, ptr [[TMP1]], align 16 @@ -2855,8 +2841,8 @@ void mapInt128() { // CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK5-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9mapInt128v_l74.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK5-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK5-NEXT: i32 1, 
label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK5-NEXT: ] // CHECK5: .omp.reduction.case1: // CHECK5-NEXT: [[TMP7:%.*]] = load i128, ptr [[TMP1]], align 16 @@ -3004,8 +2990,8 @@ void mapInt128() { // CHECK7-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK7-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 2, i32 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16mapWithReductionv_l39.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK7-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK7-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK7-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK7-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK7-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK7-NEXT: ] // CHECK7: .omp.reduction.case1: // CHECK7-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -3191,8 +3177,8 @@ void mapInt128() { // CHECK7-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK7-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8mapArrayv_l63.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK7-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK7-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK7-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK7-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK7-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK7-NEXT: ] // CHECK7: .omp.reduction.case1: // CHECK7-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP1]], i32 
99 @@ -3313,8 +3299,8 @@ void mapInt128() { // CHECK7-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK7-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP5]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8mapArrayv_l65.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK7-NEXT: switch i32 [[TMP6]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK7-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK7-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK7-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK7-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK7-NEXT: ] // CHECK7: .omp.reduction.case1: // CHECK7-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP1]], i32 99 diff --git a/clang/test/OpenMP/target_teams_num_teams_codegen.cpp b/clang/test/OpenMP/target_teams_num_teams_codegen.cpp index 210cfb922b5e2..e5618aefaf3cb 100644 --- a/clang/test/OpenMP/target_teams_num_teams_codegen.cpp +++ b/clang/test/OpenMP/target_teams_num_teams_codegen.cpp @@ -752,13 +752,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3bari // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1337,13 +1330,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l104 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef 
[[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: diff --git a/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp b/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp index bafab12a529d1..b6593b7e53685 100644 --- a/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp +++ b/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp @@ -771,13 +771,6 @@ int bar(int n){ // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z3bari // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1375,13 +1368,6 @@ int bar(int n){ // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l104 // CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: diff --git a/clang/test/OpenMP/teams_codegen.cpp b/clang/test/OpenMP/teams_codegen.cpp index 914c0f275b3d5..4aab9dae2d6ad 100644 --- a/clang/test/OpenMP/teams_codegen.cpp +++ b/clang/test/OpenMP/teams_codegen.cpp @@ -901,13 +901,6 @@ void foo() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z27teams_argument_global_locali // CHECK3-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1511,13 
+1504,6 @@ void foo() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z18teams_template_argv // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1745,13 +1731,6 @@ void foo() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z18teams_template_argv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1979,13 +1958,6 @@ void foo() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK17-SAME: () #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -2192,13 +2164,6 @@ void foo() { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK19-SAME: () #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -2405,13 +2370,6 @@ void foo() { // CHECK19-NEXT: ret void // // -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR3:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// -// // CHECK25-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l216 // CHECK25-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[ARGC:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK25-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_codegen.cpp b/clang/test/OpenMP/teams_distribute_codegen.cpp index 6856482539ea7..0bfadcf70d9bc 100644 --- a/clang/test/OpenMP/teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_codegen.cpp @@ -463,7 +463,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l35 -// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 @@ -566,13 +566,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_argument_globali // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -860,7 +853,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l35 -// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 @@ -962,13 +955,6 @@ int main (int argc, 
char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1059,7 +1045,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK9-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 0 @@ -1085,7 +1071,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1179,13 +1165,6 @@ int main (int argc, char 
**argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1276,7 +1255,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK11-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i32 0 @@ -1302,7 +1281,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1395,13 +1374,6 @@ int main (int argc, 
char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK17-SAME: () #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -1460,7 +1432,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK17-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK17: omp_offload.cont: // CHECK17-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1480,7 +1452,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1547,13 +1519,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void 
@__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK19-SAME: () #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -1612,7 +1577,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK19-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1632,7 +1597,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1698,13 +1663,6 @@ int main (int argc, char **argv) { // CHECK19-NEXT: ret void // // -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// -// // CHECK25-LABEL: define {{[^@]+}}@main // CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK25-NEXT: 
entry: @@ -1801,7 +1759,7 @@ int main (int argc, char **argv) { // CHECK25-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK25-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK25: omp_offload.failed: -// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK25: omp_offload.cont: // CHECK25-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1829,7 +1787,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1924,7 +1882,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK25-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -2000,14 +1958,14 @@ int main (int argc, char **argv) { 
// CHECK25-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK25-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK25: omp_offload.failed: -// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR4]] +// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR3]] // CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK25: omp_offload.cont: // CHECK25-NEXT: ret i32 0 // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151 -// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[TE_ADDR:%.*]] = alloca i64, align 8 // CHECK25-NEXT: [[TH_ADDR:%.*]] = alloca i64, align 8 @@ -2025,7 +1983,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2091,13 +2049,6 @@ int main (int argc, char **argv) { // CHECK25-NEXT: ret void // // -// CHECK25-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK25-SAME: () #[[ATTR7:[0-9]+]] { -// 
CHECK25-NEXT: entry: -// CHECK25-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK25-NEXT: ret void -// -// // CHECK27-LABEL: define {{[^@]+}}@main // CHECK27-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK27-NEXT: entry: @@ -2194,7 +2145,7 @@ int main (int argc, char **argv) { // CHECK27-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK27-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK27: omp_offload.failed: -// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK27: omp_offload.cont: // CHECK27-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -2222,7 +2173,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2316,7 +2267,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK27-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK27-SAME: (i32 
noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK27-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -2392,14 +2343,14 @@ int main (int argc, char **argv) { // CHECK27-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK27-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK27: omp_offload.failed: -// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR4]] +// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR3]] // CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK27: omp_offload.cont: // CHECK27-NEXT: ret i32 0 // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151 -// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[TE_ADDR:%.*]] = alloca i32, align 4 // CHECK27-NEXT: [[TH_ADDR:%.*]] = alloca i32, align 4 @@ -2417,7 +2368,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2481,10 +2432,3 @@ int main (int argc, char **argv) { // CHECK27-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) // CHECK27-NEXT: ret void // -// -// CHECK27-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK27-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK27-NEXT: entry: -// CHECK27-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK27-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp index fccd4268a405f..178af730a8d4a 100644 --- a/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp @@ -156,7 +156,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -177,7 +177,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -258,13 +258,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -324,7 +317,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -345,7 +338,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -424,13 +417,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: 
entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -560,7 +546,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CHECK9-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l82(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l82(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP51:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -593,7 +579,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l82.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -741,7 +727,7 @@ 
int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -790,7 +776,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: ret i32 0 @@ -807,7 +793,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -887,13 +873,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define 
{{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1022,7 +1001,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 // CHECK11-NEXT: br i1 [[TMP49]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l82(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l82(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1055,7 +1034,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l82.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1201,7 +1180,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// 
CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -1250,7 +1229,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: ret i32 0 @@ -1267,7 +1246,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1344,10 +1323,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git 
a/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp index 40b03586adf9b..f21c7e9be9205 100644 --- a/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp @@ -199,7 +199,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -241,7 +241,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -283,7 +283,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK1-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK1: omp_offload.failed16: -// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK1: omp_offload.cont17: // CHECK1-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -303,7 +303,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -381,7 +381,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -459,7 +459,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: 
(ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -543,13 +543,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -618,7 +611,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -660,7 +653,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK3-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK3: omp_offload.failed8: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK3: omp_offload.cont9: // CHECK3-NEXT: [[A10:%.*]] = 
getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -702,7 +695,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK3-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK3: omp_offload.failed16: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK3: omp_offload.cont17: // CHECK3-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -722,7 +715,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -799,7 +792,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ 
-876,7 +869,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -959,13 +952,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1080,7 +1066,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK9-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -1151,7 +1137,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP72:%.*]] = icmp ne i32 [[TMP71]], 0 // CHECK9-NEXT: br i1 [[TMP72]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK9: omp_offload.failed16: -// 
CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK9: omp_offload.cont17: // CHECK9-NEXT: [[TMP73:%.*]] = load i32, ptr [[N]], align 4 @@ -1222,7 +1208,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP107:%.*]] = icmp ne i32 [[TMP106]], 0 // CHECK9-NEXT: br i1 [[TMP107]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK9: omp_offload.failed31: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i64 [[TMP74]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i64 [[TMP74]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK9: omp_offload.cont32: // CHECK9-NEXT: [[TMP108:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1250,7 +1236,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1360,7 +1346,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1477,7 +1463,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1592,7 +1578,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca 
i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -1650,7 +1636,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -1691,7 +1677,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK9: omp_offload.failed6: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK9: omp_offload.cont7: // CHECK9-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 @@ -1732,7 +1718,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK9-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK9: omp_offload.failed13: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK9: omp_offload.cont14: // CHECK9-NEXT: ret i32 0 @@ -1749,7 +1735,7 @@ int 
main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1826,7 +1812,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1903,7 +1889,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1986,13 +1972,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -2107,7 +2086,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK11-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -2179,7 +2158,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP73:%.*]] = icmp ne i32 [[TMP72]], 0 // CHECK11-NEXT: br i1 [[TMP73]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK11: omp_offload.failed16: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK11: omp_offload.cont17: // CHECK11-NEXT: [[TMP74:%.*]] = load i32, ptr [[N]], align 4 @@ -2251,7 +2230,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK11-NEXT: br i1 [[TMP109]], label 
[[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK11: omp_offload.failed31: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i32 [[TMP75]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i32 [[TMP75]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK11: omp_offload.cont32: // CHECK11-NEXT: [[TMP110:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -2279,7 +2258,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2388,7 +2367,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2504,7 +2483,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2618,7 +2597,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -2676,7 +2655,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79(ptr [[A]]) #[[ATTR3]] // 
CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -2717,7 +2696,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK11-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK11: omp_offload.failed6: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK11: omp_offload.cont7: // CHECK11-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 @@ -2758,7 +2737,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK11-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK11: omp_offload.failed13: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK11: omp_offload.cont14: // CHECK11-NEXT: ret i32 0 @@ -2775,7 +2754,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: 
entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2851,7 +2830,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2927,7 +2906,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3008,10 +2987,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp 
b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp index c65348521226f..57175e72b79f8 100644 --- a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp @@ -371,7 +371,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l94.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -531,7 +531,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -730,7 +730,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -946,13 +946,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1182,7 +1175,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l94.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1340,7 +1333,7 @@ int main() { // // // CHECK3-LABEL: define 
{{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1539,7 +1532,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1753,13 +1746,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1884,7 +1870,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75 -// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[SIVAR:%.*]], i64 noundef [[G1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[SIVAR:%.*]], i64 noundef [[G1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: 
[[G_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8 @@ -1912,7 +1898,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2000,10 +1986,3 @@ int main() { // CHECK9-NEXT: call void @__cxx_global_var_init.2() // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp index 4e6484af530c0..5ee4a5ce9e29d 100644 --- a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp @@ -174,7 +174,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef 
[[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -285,13 +285,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -340,7 +333,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -451,13 +444,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { 
-// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -561,7 +547,7 @@ int main() { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i64 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i64 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -572,11 +558,11 @@ int main() { // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP40]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done2: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP41:%.*]] = load 
i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP41]] // @@ -628,7 +614,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -760,14 +746,14 @@ int main() { // CHECK9-NEXT: store i32 [[TMP26]], ptr [[TMP4]], align 4 // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN14:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN14]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP27]], [[DOTOMP_LASTPRIVATE_DONE]] ], 
[ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN14]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done15: @@ -780,12 +766,12 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -872,7 +858,7 @@ int main() { // CHECK9-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 // CHECK9-NEXT: br i1 [[TMP34]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // 
CHECK9: omp_offload.cont: // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -882,11 +868,11 @@ int main() { // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP35]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done2: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP36:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP36]] // @@ -970,7 +956,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 
dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1096,14 +1082,14 @@ int main() { // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP4]], ptr align 4 [[TMP24]], i64 4, i1 false) // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN13]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP25]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done14: @@ -1116,7 +1102,7 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 
8 -// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // @@ -1154,13 +1140,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1264,7 +1243,7 @@ int main() { // CHECK11-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK11-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i32 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i32 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -1275,11 +1254,11 @@ int main() { // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP40]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr 
[[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done2: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP41]] // @@ -1331,7 +1310,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1461,14 +1440,14 @@ int main() { // CHECK11-NEXT: store i32 [[TMP26]], ptr [[TMP4]], align 4 // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr 
noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN13]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP27]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done14: @@ -1481,12 +1460,12 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1573,7 +1552,7 @@ int main() { // CHECK11-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 // 
CHECK11-NEXT: br i1 [[TMP34]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -1583,11 +1562,11 @@ int main() { // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP35]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done2: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP36:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP36]] // @@ -1671,7 +1650,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef 
nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1795,14 +1774,14 @@ int main() { // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP4]], ptr align 4 [[TMP24]], i32 4, i1 false) // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN12]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP25]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 
dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done13: @@ -1815,7 +1794,7 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // @@ -1852,10 +1831,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp index 54dfac4922d45..6dcfa4f6f2abc 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp @@ -576,7 +576,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l35 -// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 @@ -777,13 +777,6 
@@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_argument_globali // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1180,7 +1173,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l35 -// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 @@ -1376,13 +1369,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1473,7 +1459,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK9-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: 
omp_offload.cont: // CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 0 @@ -1499,7 +1485,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1590,7 +1576,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1694,13 +1680,6 @@ int main 
(int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1791,7 +1770,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK11-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i32 0 @@ -1817,7 +1796,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1906,7 +1885,7 @@ int 
main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l73.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2007,13 +1986,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK17-SAME: () #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -2072,7 +2044,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK17-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // 
CHECK17: omp_offload.cont: // CHECK17-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2092,7 +2064,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2155,7 +2127,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2232,13 +2204,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK19-SAME: () #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ 
-2297,7 +2262,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK19-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2317,7 +2282,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2378,7 +2343,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l109.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK19-NEXT: entry: // 
CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2452,13 +2417,6 @@ int main (int argc, char **argv) { // CHECK19-NEXT: ret void // // -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// -// // CHECK25-LABEL: define {{[^@]+}}@main // CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK25-NEXT: entry: @@ -2555,7 +2513,7 @@ int main (int argc, char **argv) { // CHECK25-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK25-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK25: omp_offload.failed: -// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK25: omp_offload.cont: // CHECK25-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -2583,7 +2541,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // 
CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2674,7 +2632,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162.omp_outlined.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2779,7 +2737,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK25-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -2855,14 +2813,14 @@ int main (int argc, char **argv) { // CHECK25-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK25-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK25: omp_offload.failed: -// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR4]] +// CHECK25-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR3]] // CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK25: omp_offload.cont: // CHECK25-NEXT: ret i32 0 // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151 -// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[TE_ADDR:%.*]] = alloca i64, align 8 // CHECK25-NEXT: [[TH_ADDR:%.*]] = alloca i64, align 8 @@ -2880,7 +2838,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2943,7 +2901,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151.omp_outlined.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef 
[[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3019,13 +2977,6 @@ int main (int argc, char **argv) { // CHECK25-NEXT: ret void // // -// CHECK25-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK25-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK25-NEXT: entry: -// CHECK25-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK25-NEXT: ret void -// -// // CHECK27-LABEL: define {{[^@]+}}@main // CHECK27-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK27-NEXT: entry: @@ -3122,7 +3073,7 @@ int main (int argc, char **argv) { // CHECK27-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK27-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK27: omp_offload.failed: -// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK27: omp_offload.cont: // CHECK27-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -3150,7 +3101,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3239,7 +3190,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l162.omp_outlined.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3341,7 +3292,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK27-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK27-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK27-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -3417,14 +3368,14 @@ int main (int argc, char **argv) { // CHECK27-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK27-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK27: omp_offload.failed: -// CHECK27-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR4]] +// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR3]] // CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK27: omp_offload.cont: // CHECK27-NEXT: ret i32 0 // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151 -// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[TE_ADDR:%.*]] = alloca i32, align 4 // CHECK27-NEXT: [[TH_ADDR:%.*]] = alloca i32, align 4 @@ -3442,7 +3393,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3503,7 +3454,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l151.omp_outlined.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) 
[[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3575,10 +3526,3 @@ int main (int argc, char **argv) { // CHECK27-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]]) // CHECK27-NEXT: ret void // -// -// CHECK27-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK27-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK27-NEXT: entry: -// CHECK27-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK27-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp index e909bccab23f9..60c54f7bc0b43 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp @@ -161,7 +161,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -182,7 +182,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -247,7 +247,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -338,13 +338,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -404,7 +397,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: 
omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -425,7 +418,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -488,7 +481,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -575,13 +568,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -711,7 +697,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CHECK9-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP51:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -744,7 +730,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca 
ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -856,7 +842,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1012,7 +998,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -1061,7 +1047,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr 
[[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: ret i32 0 @@ -1078,7 +1064,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1143,7 +1129,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1233,13 +1219,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// 
CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1368,7 +1347,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 // CHECK11-NEXT: br i1 [[TMP49]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1401,7 +1380,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1515,7 +1494,7 
@@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1671,7 +1650,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -1720,7 +1699,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: ret i32 0 @@ -1737,7 +1716,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1800,7 +1779,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1885,10 +1864,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: 
() #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp index 5630f3b12fd6e..3455597f90900 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp @@ -152,7 +152,7 @@ int main() { // CHECK1-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0 // CHECK1-NEXT: br i1 [[TMP25]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64(i64 [[TMP2]], ptr [[A]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64(i64 [[TMP2]], ptr [[A]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -172,7 +172,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -238,7 +238,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64.omp_outlined.omp_outlined 
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -319,7 +319,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR4:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[A:%.*]] = alloca [2 x i32], align 4 // CHECK1-NEXT: [[X_CASTED:%.*]] = alloca i64, align 8 @@ -376,7 +376,7 @@ int main() { // CHECK1-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0 // CHECK1-NEXT: br i1 [[TMP25]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34(i64 [[TMP2]], ptr [[A]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34(i64 [[TMP2]], ptr [[A]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret i32 0 @@ -395,7 +395,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[X:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -461,7 +461,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -542,18 +542,11 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_ZTW1x -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK1-NEXT: [[TMP1:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @x) // CHECK1-NEXT: ret ptr [[TMP1]] // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR6]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -614,7 +607,7 
@@ int main() { // CHECK3-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0 // CHECK3-NEXT: br i1 [[TMP25]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64(i32 [[TMP2]], ptr [[A]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64(i32 [[TMP2]], ptr [[A]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -634,7 +627,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -698,7 +691,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef 
nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -776,7 +769,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR4:[0-9]+]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[A:%.*]] = alloca [2 x i32], align 4 // CHECK3-NEXT: [[X_CASTED:%.*]] = alloca i32, align 4 @@ -833,7 +826,7 @@ int main() { // CHECK3-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0 // CHECK3-NEXT: br i1 [[TMP25]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34(i32 [[TMP2]], ptr [[A]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34(i32 [[TMP2]], ptr [[A]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: ret i32 0 @@ -852,7 +845,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -916,7 +909,7 @@ int main() { // // // CHECK3-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l34.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -994,18 +987,11 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_ZTW1x -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK3-NEXT: [[TMP1:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @x) // CHECK3-NEXT: ret ptr [[TMP1]] // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1032,7 +1018,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l46.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull 
align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1098,7 +1084,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l46.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1187,14 +1173,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@_ZTW1x -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: [[TMP1:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @x) // CHECK9-NEXT: ret ptr [[TMP1]] // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp index f446746aef20f..f324d2c7ac90c 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp +++ 
b/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp @@ -211,7 +211,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -253,7 +253,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -295,7 +295,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK1-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK1: omp_offload.failed16: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK1: 
omp_offload.cont17: // CHECK1-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -315,7 +315,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -378,7 +378,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -466,7 +466,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { 
// CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -529,7 +529,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -617,7 +617,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -700,7 +700,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -777,13 +777,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -852,7 +845,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -894,7 +887,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK3-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK3: omp_offload.failed8: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label 
[[OMP_OFFLOAD_CONT9]] // CHECK3: omp_offload.cont9: // CHECK3-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -936,7 +929,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK3-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK3: omp_offload.failed16: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK3: omp_offload.cont17: // CHECK3-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -956,7 +949,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1017,7 +1010,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef 
[[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1102,7 +1095,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1163,7 +1156,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1248,7 +1241,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// 
CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1329,7 +1322,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1403,13 +1396,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1527,7 +1513,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK9-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -1598,7 +1584,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP72:%.*]] = icmp ne i32 [[TMP71]], 0 // CHECK9-NEXT: br i1 [[TMP72]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK9: omp_offload.failed16: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l111(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l111(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK9: omp_offload.cont17: // CHECK9-NEXT: [[TMP73:%.*]] = load i32, ptr [[M]], align 4 @@ -1678,7 +1664,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP112:%.*]] = icmp ne i32 [[TMP111]], 0 // CHECK9-NEXT: br i1 [[TMP112]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK9: omp_offload.failed31: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK9: omp_offload.cont32: // CHECK9-NEXT: [[TMP113:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1706,7 +1692,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) 
[[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1797,7 +1783,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1917,7 +1903,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l111.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2008,7 +1994,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l111.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2137,7 +2123,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef 
[[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2259,7 +2245,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2366,7 +2352,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -2427,7 +2413,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -2468,7 +2454,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK9: omp_offload.failed6: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK9: omp_offload.cont7: // CHECK9-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -2518,7 +2504,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK9-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK9: omp_offload.failed13: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l94(i64 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l94(i64 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK9: omp_offload.cont14: // CHECK9-NEXT: ret i32 0 @@ -2535,7 +2521,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias 
noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2598,7 +2584,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2685,7 +2671,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2748,7 +2734,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2844,7 +2830,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l94.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2934,7 +2920,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l94.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// 
CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3012,13 +2998,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -3136,7 +3115,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK11-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -3208,7 +3187,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP73:%.*]] = icmp ne i32 [[TMP72]], 0 // CHECK11-NEXT: br i1 [[TMP73]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK11: omp_offload.failed16: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l111(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK11-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l111(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK11: omp_offload.cont17: // CHECK11-NEXT: [[TMP74:%.*]] = load i32, ptr [[M]], align 4 @@ -3289,7 +3268,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP114:%.*]] = icmp ne i32 [[TMP113]], 0 // CHECK11-NEXT: br i1 [[TMP114]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK11: omp_offload.failed31: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK11: omp_offload.cont32: // CHECK11-NEXT: [[TMP115:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -3317,7 +3296,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3406,7 +3385,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3523,7 +3502,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l111.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3612,7 +3591,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l111.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr 
noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3738,7 +3717,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3858,7 +3837,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr 
noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3962,7 +3941,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -4023,7 +4002,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -4064,7 +4043,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK11-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK11: omp_offload.failed6: -// CHECK11-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK11: omp_offload.cont7: // CHECK11-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -4114,7 +4093,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK11-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK11: omp_offload.failed13: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l94(i32 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l94(i32 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK11: omp_offload.cont14: // CHECK11-NEXT: ret i32 0 @@ -4131,7 +4110,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4192,7 +4171,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef 
[[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4276,7 +4255,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4337,7 +4316,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4430,7 +4409,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l94.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4518,7 +4497,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l94.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4592,10 +4571,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () 
#[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp index 0bbef272767f0..24c3bc4adfefd 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp @@ -405,7 +405,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -560,7 +560,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull 
align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -695,7 +695,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -894,7 +894,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1029,7 +1029,7 @@ int main() { // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1240,13 +1240,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1476,7 +1469,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1629,7 +1622,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l99.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1760,7 +1753,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1959,7 +1952,7 @@ int main() { // // // 
CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2092,7 +2085,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2299,13 +2292,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -2430,7 +2416,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75 -// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[SIVAR:%.*]], i64 noundef [[G1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[SIVAR:%.*]], i64 noundef [[G1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8 @@ -2458,7 +2444,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2539,7 +2525,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 
noundef [[SIVAR:%.*]]) #[[ATTR6]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2637,10 +2623,3 @@ int main() { // CHECK9-NEXT: call void @__cxx_global_var_init.2() // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp index dfaac972b68c1..31367697db235 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp @@ -150,7 +150,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48() #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48() #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -183,7 +183,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l52() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l52() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret void @@ -197,7 +197,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -257,7 +257,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -334,7 +334,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l52.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -382,7 +382,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l52.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l52.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -399,7 +399,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l52.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -470,7 +470,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@main -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 @@ -515,7 +515,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label 
[[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -548,7 +548,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l89() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l89() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr @Arg, align 4 @@ -600,7 +600,7 @@ int main() { // CHECK1-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97(i64 [[TMP31]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97(i64 [[TMP31]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 @@ -616,7 +616,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -676,7 +676,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -754,7 +754,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l89.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -802,7 +802,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l89.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l89.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK1-NEXT: call void 
@__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -819,7 +819,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l89.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -910,7 +910,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -967,7 +967,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97.omp_outlined.omp_outlined(ptr [[TMP12]], 
ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: br label [[OMP_IF_END]] // CHECK1: omp_if.end: @@ -986,7 +986,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l97.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1102,7 +1102,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -1135,7 +1135,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67() #[[ATTR2]] // CHECK1-NEXT: br label 
[[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 @@ -1187,7 +1187,7 @@ int main() { // CHECK1-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72(i64 [[TMP31]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72(i64 [[TMP31]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: ret i32 0 @@ -1201,7 +1201,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1261,7 +1261,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1339,7 +1339,7 @@ int main() { // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1387,7 +1387,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -1404,7 +1404,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 
@@ -1495,7 +1495,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1552,7 +1552,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK1-NEXT: br label [[OMP_IF_END]] // CHECK1: omp_if.end: @@ -1571,7 +1571,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l72.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // 
CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1640,10 +1640,3 @@ int main() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp index cf42569ff3fe5..45197d5e296de 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp @@ -212,7 +212,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -309,7 +309,7 @@ int main() { // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -430,13 +430,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -485,7 +478,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// 
CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -580,7 +573,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -699,13 +692,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main 
// CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -809,7 +795,7 @@ int main() { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i64 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i64 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -820,11 +806,11 @@ int main() { // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP40]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done2: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP41]] // @@ -876,7 +862,7 @@ int main() { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -999,14 +985,14 @@ int main() { // CHECK9-NEXT: store i32 [[TMP27]], ptr [[TMP4]], align 4 // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN11:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN11]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP28]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds 
[[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN11]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE12:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done12: @@ -1014,7 +1000,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1156,14 +1142,14 @@ int main() { // CHECK9-NEXT: store i32 [[TMP28]], ptr [[TMP4]], align 4 // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // 
CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN15:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR5]], i32 0, i32 0 // CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN15]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP29]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN15]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE16:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done16: @@ -1176,12 +1162,12 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // 
CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1268,7 +1254,7 @@ int main() { // CHECK9-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 // CHECK9-NEXT: br i1 [[TMP34]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -1278,11 +1264,11 @@ int main() { // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP35]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done2: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP36:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP36]] // @@ -1366,7 +1352,7 @@ int main() { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1483,14 +1469,14 @@ int main() { // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP4]], ptr align 4 [[TMP25]], i64 4, i1 false) // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN10:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN10]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP26]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// 
CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN10]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE11:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done11: @@ -1498,7 +1484,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1634,14 +1620,14 @@ int main() { // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 [[TMP26]], i64 4, i1 false) // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) 
[[VAR6]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN14:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR5]], i32 0, i32 0 // CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN14]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP27]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN14]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done15: @@ -1654,7 +1640,7 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // @@ -1692,13 +1678,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // 
CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1802,7 +1781,7 @@ int main() { // CHECK11-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK11-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i32 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i32 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -1813,11 +1792,11 @@ int main() { // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP40]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done2: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP41]] // @@ -1869,7 +1848,7 @@ int main() { // // // CHECK11-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1990,14 +1969,14 @@ int main() { // CHECK11-NEXT: store i32 [[TMP25]], ptr [[TMP4]], align 4 // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN11:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN11]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP26]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = 
getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN11]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE12:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done12: @@ -2005,7 +1984,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l106.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2143,14 +2122,14 @@ int main() { // CHECK11-NEXT: store i32 [[TMP28]], ptr [[TMP4]], align 4 // CHECK11-NEXT: br 
label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN13]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP29]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done14: @@ -2163,12 +2142,12 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: 
entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2255,7 +2234,7 @@ int main() { // CHECK11-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 // CHECK11-NEXT: br i1 [[TMP34]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -2265,11 +2244,11 @@ int main() { // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP35]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done2: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP36:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP36]] // @@ -2353,7 +2332,7 @@ int main() { // // // 
CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2468,14 +2447,14 @@ int main() { // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP4]], ptr align 4 [[TMP23]], i32 4, i1 false) // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN10:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN10]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP24]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr 
[[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN10]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE11:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done11: @@ -2483,7 +2462,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2615,14 +2594,14 @@ int main() { // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP6]], ptr align 4 [[TMP26]], i32 4, i1 false) // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void 
@_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN12]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP27]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done13: @@ -2635,7 +2614,7 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // @@ -2672,10 +2651,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// 
CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp index efa10037a9577..c805d739cf9c2 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp @@ -93,7 +93,7 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK1-NEXT: [[CALL:%.*]] = invoke noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -126,16 +126,16 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50() #[[ATTR5:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50() #[[ATTR4:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: lpad: // CHECK1-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: cleanup +// CHECK1-NEXT: cleanup // CHECK1-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK1-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK1-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// 
CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] // CHECK1-NEXT: br label [[EH_RESUME:%.*]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -184,21 +184,21 @@ int main() { // CHECK1-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 // CHECK1-NEXT: br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55(i64 [[TMP19]]) #[[ATTR5]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55(i64 [[TMP19]]) #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[TMP44:%.*]] = load i8, ptr [[A]], align 1 // CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP44]] to i32 // CHECK1-NEXT: [[CALL6:%.*]] = invoke noundef signext i32 @_Z5tmainIcLi5EEiv() -// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK1: invoke.cont5: // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK1-NEXT: [[CALL8:%.*]] = invoke noundef signext i32 @_Z5tmainI1SLi1EEiv() -// CHECK1-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK1: invoke.cont7: // CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK1-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK1-NEXT: ret i32 [[TMP45]] // CHECK1: eh.resume: @@ -242,7 +242,7 @@ int main() { // 
// // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -303,7 +303,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -358,7 +358,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -375,21 +375,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// 
CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13:[0-9]+]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7:[0-9]+]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR7:[0-9]+]] comdat { -// CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5]] -// CHECK1-NEXT: call void @_ZSt9terminatev() #[[ATTR13]] +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55 -// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 @@ -405,7 +405,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -470,7 +470,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality 
ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -525,7 +525,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -542,9 +542,9 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // @@ -585,7 +585,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36() #[[ATTR5]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36() #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -618,7 +618,7 @@ int main() { // 
CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40() #[[ATTR5]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40() #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret i32 0 @@ -663,14 +663,14 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36() #[[ATTR5]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36() #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP16:%.*]] = zext i8 [[TMP15]] to i32 @@ -705,30 +705,30 @@ int main() { // CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 // CHECK1-NEXT: br i1 [[TMP32]], label 
[[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40() #[[ATTR5]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40() #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret i32 0 // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP33:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP34:%.*]] = extractvalue { ptr, i32 } [[TMP33]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP34]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP34]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK1-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ 
-742,7 +742,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -751,14 +751,14 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36 -// CHECK1-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -819,7 +819,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr 
@__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -874,7 +874,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -891,21 +891,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40 -// CHECK1-SAME: () #[[ATTR10:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -966,7 +966,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1021,7 +1021,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1038,21 +1038,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] 
= landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36 -// CHECK1-SAME: () #[[ATTR11:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1113,7 +1113,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1168,7 +1168,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1185,23 +1185,23 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40 -// CHECK1-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 // CHECK1-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_S:%.*]], align 8 // CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] +// CHECK1-NEXT: call void 
@_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: store i8 [[TMP0]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 @@ -1210,14 +1210,14 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP2:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP3]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP3]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1282,7 +1282,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // 
CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1337,7 +1337,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1354,19 +1354,12 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR12:[0-9]+]] section ".text.startup" { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: @@ -1387,7 +1380,7 @@ int main() { // CHECK5-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK5-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK5-NEXT: [[CALL:%.*]] = invoke noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK5-NEXT: 
[[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -1420,16 +1413,16 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50() #[[ATTR5:[0-9]+]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50() #[[ATTR4:[0-9]+]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: lpad: // CHECK5-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: cleanup +// CHECK5-NEXT: cleanup // CHECK5-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK5-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK5-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK5-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] // CHECK5-NEXT: br label [[EH_RESUME:%.*]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -1478,21 +1471,21 @@ int main() { // CHECK5-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 // CHECK5-NEXT: br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55(i64 [[TMP19]]) #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55(i64 [[TMP19]]) #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: [[TMP44:%.*]] = load i8, ptr [[A]], align 1 // CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP44]] to i32 // CHECK5-NEXT: [[CALL6:%.*]] = invoke noundef signext i32 
@_Z5tmainIcLi5EEiv() -// CHECK5-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK5: invoke.cont5: // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK5-NEXT: [[CALL8:%.*]] = invoke noundef signext i32 @_Z5tmainI1SLi1EEiv() -// CHECK5-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK5: invoke.cont7: // CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK5-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] // CHECK5-NEXT: [[TMP45:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK5-NEXT: ret i32 [[TMP45]] // CHECK5: eh.resume: @@ -1536,7 +1529,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1597,7 +1590,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], 
i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1652,7 +1645,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -1669,21 +1662,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13:[0-9]+]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7:[0-9]+]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK5-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR7:[0-9]+]] comdat { -// CHECK5-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5]] -// CHECK5-NEXT: call void @_ZSt9terminatev() #[[ATTR13]] +// CHECK5-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK5-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55 -// CHECK5-SAME: (i64 noundef [[A:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK5-NEXT: 
[[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 @@ -1699,7 +1692,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1764,7 +1757,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1819,7 +1812,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -1836,9 +1829,9 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP11:%.*]] = landingpad { 
ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // @@ -1879,7 +1872,7 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36() #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36() #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -1912,7 +1905,7 @@ int main() { // CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK5-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40() #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40() #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: ret i32 0 @@ -1957,14 +1950,14 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36() #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36() #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: 
omp_offload.cont: // CHECK5-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] // CHECK5-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: [[TMP15:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: [[TMP16:%.*]] = zext i8 [[TMP15]] to i32 @@ -1999,30 +1992,30 @@ int main() { // CHECK5-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 // CHECK5-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40() #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40() #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: ret i32 0 // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP33:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP34:%.*]] = extractvalue { ptr, i32 } [[TMP33]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP34]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP34]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat { +// CHECK5-SAME: 
(ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK5-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -2036,14 +2029,14 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36 -// CHECK5-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2104,7 +2097,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2159,7 +2152,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2176,21 +2169,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: 
[[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40 -// CHECK5-SAME: () #[[ATTR10:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2251,7 +2244,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2306,7 +2299,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2323,21 +2316,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36 -// CHECK5-SAME: () #[[ATTR11:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2398,7 +2391,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2453,7 +2446,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2470,23 +2463,23 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: 
[[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40 -// CHECK5-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 // CHECK5-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_S:%.*]], align 8 // CHECK5-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK5-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] // CHECK5-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: store i8 [[TMP0]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 @@ -2495,14 +2488,14 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP2:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 0 -// CHECK5-NEXT: call 
void @__clang_call_terminate(ptr [[TMP3]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP3]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2567,7 +2560,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2622,7 +2615,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2639,24 +2632,17 @@ int 
main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP11:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP12:%.*]] = extractvalue { ptr, i32 } [[TMP11]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP12]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: ret void // -// -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR12:[0-9]+]] section ".text.startup" { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp index 2918c81d3416f..fe537dc743d4e 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp @@ -307,7 +307,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, 
align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -398,7 +398,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -512,7 +512,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -612,7 +612,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -706,7 +706,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias 
noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -875,13 +875,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1047,7 +1040,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1136,7 +1129,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1246,7 +1239,7 @@ int main() { // // // CHECK3-LABEL: 
define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1346,7 +1339,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1438,7 +1431,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1603,13 +1596,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1734,7 +1720,7 @@ int main() { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75 -// CHECK9-SAME: (i64 noundef [[G1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[G1_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[TMP:%.*]] = alloca ptr, align 8 @@ -1745,7 +1731,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1812,7 +1798,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR6]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1909,10 +1895,3 @@ int main() { // CHECK9-NEXT: call void @__cxx_global_var_init.2() // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp 
b/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp index a88a25e9f425a..cbd426aabb9c7 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp @@ -89,7 +89,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l36() #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l36() #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -122,7 +122,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l39() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l39() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -137,7 +137,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l36.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -198,7 +198,7 @@ int main() { 
// // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l36.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -275,7 +275,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l39.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -336,7 +336,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l39.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -406,7 +406,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] comdat 
{ +// CHECK1-SAME: () #[[ATTR3:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -440,7 +440,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l29() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l29() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret i32 0 @@ -454,7 +454,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l29.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -515,7 +515,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l29.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -583,10 +583,3 @@ int 
main() { // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]]) // CHECK1-NEXT: ret void // -// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp index b7954f7214318..a003b9f203a47 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp @@ -145,7 +145,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i64 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i64 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -162,7 +162,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -228,8 +228,8 @@ int main() { // CHECK1-NEXT: store ptr 
[[SIVAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -247,7 +247,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -327,8 +327,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR2]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], 
label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -346,7 +346,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -366,7 +366,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -386,7 +386,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 @@ -439,7 +439,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR3]] 
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret i32 0 @@ -455,7 +455,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -521,8 +521,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -540,7 +540,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], 
i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -620,8 +620,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR2]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -639,7 +639,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -659,7 +659,7 @@ int main() { // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -678,13 +678,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -737,7 +730,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i32 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i32 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -754,7 +747,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -818,8 +811,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -837,7 +830,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -915,8 +908,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -934,7 +927,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -954,7 +947,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -974,7 +967,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 @@ -1027,7 +1020,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label 
[[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: ret i32 0 @@ -1043,7 +1036,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1107,8 +1100,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1126,7 +1119,7 @@ int main() { // // // CHECK3-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1204,8 +1197,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1223,7 +1216,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: 
[[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -1243,7 +1236,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -1262,13 +1255,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1289,7 +1275,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1355,8 +1341,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK9-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) 
// CHECK9-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1374,7 +1360,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1458,8 +1444,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR2]], ptr [[TMP15]], align 8 // CHECK9-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: 
.omp.reduction.case1: // CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1477,7 +1463,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -1497,7 +1483,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -1515,10 +1501,3 @@ int main() { // CHECK9-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp index 76a451ba8ef03..ab0e08259efe0 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp @@ -286,7 +286,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -328,7 +328,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -370,7 +370,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK1-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK1: omp_offload.failed16: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK1: omp_offload.cont17: // CHECK1-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -412,7 +412,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP79:%.*]] = icmp ne i32 [[TMP78]], 0 // CHECK1-NEXT: br i1 [[TMP79]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] // CHECK1: 
omp_offload.failed24: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT25]] // CHECK1: omp_offload.cont25: // CHECK1-NEXT: [[A26:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -454,7 +454,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP99:%.*]] = icmp ne i32 [[TMP98]], 0 // CHECK1-NEXT: br i1 [[TMP99]], label [[OMP_OFFLOAD_FAILED32:%.*]], label [[OMP_OFFLOAD_CONT33:%.*]] // CHECK1: omp_offload.failed32: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT33]] // CHECK1: omp_offload.cont33: // CHECK1-NEXT: [[A34:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -474,7 +474,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -537,7 +537,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -625,7 +625,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -688,7 +688,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -776,7 +776,7 @@ int main (int argc, 
char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -839,7 +839,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -948,7 +948,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ 
-1011,7 +1011,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1097,7 +1097,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1160,7 +1160,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef 
[[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1235,13 +1235,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1320,7 +1313,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1362,7 +1355,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK3-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK3: omp_offload.failed8: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK3: omp_offload.cont9: // CHECK3-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, 
i32 0 @@ -1404,7 +1397,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK3-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK3: omp_offload.failed16: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK3: omp_offload.cont17: // CHECK3-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1446,7 +1439,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP79:%.*]] = icmp ne i32 [[TMP78]], 0 // CHECK3-NEXT: br i1 [[TMP79]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] // CHECK3: omp_offload.failed24: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT25]] // CHECK3: omp_offload.cont25: // CHECK3-NEXT: [[A26:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1488,7 +1481,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP99:%.*]] = icmp ne i32 [[TMP98]], 0 // CHECK3-NEXT: br i1 [[TMP99]], label [[OMP_OFFLOAD_FAILED32:%.*]], label [[OMP_OFFLOAD_CONT33:%.*]] // CHECK3: omp_offload.failed32: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT33]] // CHECK3: omp_offload.cont33: // CHECK3-NEXT: [[A34:%.*]] = getelementptr inbounds 
[[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1508,7 +1501,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1569,7 +1562,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1654,7 +1647,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] 
= alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1715,7 +1708,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1800,7 +1793,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1861,7 +1854,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef 
[[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1965,7 +1958,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2026,7 +2019,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2109,7 +2102,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr 
noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2170,7 +2163,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2242,13 +2235,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2327,7 +2313,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK5-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2369,7 +2355,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK5-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK5: omp_offload.failed8: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40(ptr [[THIS1]]) #[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40(ptr [[THIS1]]) #[[ATTR2]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK5: omp_offload.cont9: // CHECK5-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2411,7 +2397,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK5-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK5: omp_offload.failed16: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45(ptr [[THIS1]]) #[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45(ptr [[THIS1]]) #[[ATTR2]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK5: omp_offload.cont17: // CHECK5-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2453,7 +2439,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP79:%.*]] = icmp ne i32 [[TMP78]], 0 // CHECK5-NEXT: br i1 [[TMP79]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] // CHECK5: omp_offload.failed24: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51(ptr [[THIS1]]) 
#[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51(ptr [[THIS1]]) #[[ATTR2]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT25]] // CHECK5: omp_offload.cont25: // CHECK5-NEXT: [[A26:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2495,7 +2481,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP99:%.*]] = icmp ne i32 [[TMP98]], 0 // CHECK5-NEXT: br i1 [[TMP99]], label [[OMP_OFFLOAD_FAILED32:%.*]], label [[OMP_OFFLOAD_CONT33:%.*]] // CHECK5: omp_offload.failed32: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57(ptr [[THIS1]]) #[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57(ptr [[THIS1]]) #[[ATTR2]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT33]] // CHECK5: omp_offload.cont33: // CHECK5-NEXT: [[A34:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2515,7 +2501,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2578,7 +2564,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], 
ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2666,7 +2652,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2729,7 +2715,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2817,7 +2803,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2880,7 +2866,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2989,7 +2975,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3052,7 +3038,7 @@ int main (int argc, char 
**argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3138,7 +3124,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3201,7 +3187,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // 
CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3276,13 +3262,6 @@ int main (int argc, char **argv) { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK7-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK7-SAME: () #[[ATTR0:[0-9]+]] { // CHECK7-NEXT: entry: @@ -3361,7 +3340,7 @@ int main (int argc, char **argv) { // CHECK7-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK7-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK7: omp_offload.failed: -// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK7-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK7: omp_offload.cont: // CHECK7-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3403,7 +3382,7 @@ int main (int argc, char **argv) { // CHECK7-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK7-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK7: omp_offload.failed8: -// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40(ptr [[THIS1]]) #[[ATTR3]] +// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40(ptr [[THIS1]]) #[[ATTR2]] // CHECK7-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK7: omp_offload.cont9: // CHECK7-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3445,7 +3424,7 @@ int main (int argc, char **argv) { // 
CHECK7-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK7-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK7: omp_offload.failed16: -// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45(ptr [[THIS1]]) #[[ATTR3]] +// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45(ptr [[THIS1]]) #[[ATTR2]] // CHECK7-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK7: omp_offload.cont17: // CHECK7-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3487,7 +3466,7 @@ int main (int argc, char **argv) { // CHECK7-NEXT: [[TMP79:%.*]] = icmp ne i32 [[TMP78]], 0 // CHECK7-NEXT: br i1 [[TMP79]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] // CHECK7: omp_offload.failed24: -// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51(ptr [[THIS1]]) #[[ATTR3]] +// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51(ptr [[THIS1]]) #[[ATTR2]] // CHECK7-NEXT: br label [[OMP_OFFLOAD_CONT25]] // CHECK7: omp_offload.cont25: // CHECK7-NEXT: [[A26:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3529,7 +3508,7 @@ int main (int argc, char **argv) { // CHECK7-NEXT: [[TMP99:%.*]] = icmp ne i32 [[TMP98]], 0 // CHECK7-NEXT: br i1 [[TMP99]], label [[OMP_OFFLOAD_FAILED32:%.*]], label [[OMP_OFFLOAD_CONT33:%.*]] // CHECK7: omp_offload.failed32: -// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57(ptr [[THIS1]]) #[[ATTR3]] +// CHECK7-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57(ptr [[THIS1]]) #[[ATTR2]] // CHECK7-NEXT: br label [[OMP_OFFLOAD_CONT33]] // CHECK7: omp_offload.cont33: // CHECK7-NEXT: [[A34:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3549,7 +3528,7 @@ int 
main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3610,7 +3589,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l35.omp_outlined.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3695,7 +3674,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = 
alloca ptr, align 4 @@ -3756,7 +3735,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l40.omp_outlined.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3841,7 +3820,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3902,7 +3881,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l45.omp_outlined.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef 
[[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4006,7 +3985,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4067,7 +4046,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l51.omp_outlined.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4150,7 +4129,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4211,7 +4190,7 @@ int main (int argc, char **argv) { // // // CHECK7-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l57.omp_outlined.omp_outlined -// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK7-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK7-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4283,13 +4262,6 @@ int main (int argc, char **argv) { // CHECK7-NEXT: ret void // // -// CHECK7-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK7-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK7-NEXT: entry: -// CHECK7-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK7-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -4426,7 +4398,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK13-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148(i64 [[TMP4]], 
i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -4497,7 +4469,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP72:%.*]] = icmp ne i32 [[TMP71]], 0 // CHECK13-NEXT: br i1 [[TMP72]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK13: omp_offload.failed16: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK13: omp_offload.cont17: // CHECK13-NEXT: [[TMP73:%.*]] = load i32, ptr [[M]], align 4 @@ -4577,7 +4549,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP112:%.*]] = icmp ne i32 [[TMP111]], 0 // CHECK13-NEXT: br i1 [[TMP112]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK13: omp_offload.failed31: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK13: omp_offload.cont32: // CHECK13-NEXT: [[TMP113:%.*]] = load i32, ptr [[N]], align 4 @@ -4648,7 +4620,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP147:%.*]] = icmp ne i32 [[TMP146]], 0 // CHECK13-NEXT: br i1 [[TMP147]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]] // CHECK13: omp_offload.failed46: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163(i64 [[TMP114]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK13-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163(i64 [[TMP114]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT47]] // CHECK13: omp_offload.cont47: // CHECK13-NEXT: [[TMP148:%.*]] = load i32, ptr [[M]], align 4 @@ -4728,7 +4700,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP187:%.*]] = icmp ne i32 [[TMP186]], 0 // CHECK13-NEXT: br i1 [[TMP187]], label [[OMP_OFFLOAD_FAILED62:%.*]], label [[OMP_OFFLOAD_CONT63:%.*]] // CHECK13: omp_offload.failed62: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168(i64 [[TMP149]], i64 [[TMP151]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168(i64 [[TMP149]], i64 [[TMP151]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT63]] // CHECK13: omp_offload.cont63: // CHECK13-NEXT: [[TMP188:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -4756,7 +4728,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4847,7 +4819,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148.omp_outlined.omp_outlined -// CHECK13-SAME: 
(ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4967,7 +4939,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5058,7 +5030,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef 
[[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5187,7 +5159,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5309,7 +5281,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], 
i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5431,7 +5403,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5522,7 +5494,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) 
#[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5647,7 +5619,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5744,7 +5716,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias 
noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5848,7 +5820,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -5920,7 +5892,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK13-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116(ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116(ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -5961,7 +5933,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK13-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK13: omp_offload.failed6: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121(ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121(ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK13: omp_offload.cont7: // CHECK13-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -6011,7 +5983,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK13-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK13: omp_offload.failed13: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126(i64 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126(i64 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK13: omp_offload.cont14: // CHECK13-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0 @@ -6052,7 +6024,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CHECK13-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED20:%.*]], label [[OMP_OFFLOAD_CONT21:%.*]] // CHECK13: omp_offload.failed20: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131(ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131(ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT21]] // CHECK13: omp_offload.cont21: // CHECK13-NEXT: [[TMP85:%.*]] = load i32, ptr [[M]], align 4 @@ -6102,7 +6074,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK13-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED28:%.*]], label [[OMP_OFFLOAD_CONT29:%.*]] // CHECK13: omp_offload.failed28: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136(i64 [[TMP86]], ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136(i64 [[TMP86]], ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT29]] // CHECK13: omp_offload.cont29: // CHECK13-NEXT: ret i32 0 @@ -6119,7 +6091,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6182,7 +6154,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6269,7 +6241,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr 
noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6332,7 +6304,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6428,7 +6400,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // 
CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6497,7 +6469,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6608,7 +6580,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6671,7 +6643,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef 
[[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6765,7 +6737,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6834,7 +6806,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 
dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6911,13 +6883,6 @@ int main (int argc, char **argv) { // CHECK13-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK13-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK13-NEXT: ret void -// -// // CHECK15-LABEL: define {{[^@]+}}@main // CHECK15-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK15-NEXT: entry: @@ -7054,7 +7019,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK15-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK15: omp_offload.failed: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK15: omp_offload.cont: // CHECK15-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -7126,7 +7091,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP73:%.*]] = icmp ne i32 [[TMP72]], 0 // CHECK15-NEXT: br i1 [[TMP73]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK15: omp_offload.failed16: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK15: omp_offload.cont17: // CHECK15-NEXT: 
[[TMP74:%.*]] = load i32, ptr [[M]], align 4 @@ -7207,7 +7172,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP114:%.*]] = icmp ne i32 [[TMP113]], 0 // CHECK15-NEXT: br i1 [[TMP114]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK15: omp_offload.failed31: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK15: omp_offload.cont32: // CHECK15-NEXT: [[TMP115:%.*]] = load i32, ptr [[N]], align 4 @@ -7279,7 +7244,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP150:%.*]] = icmp ne i32 [[TMP149]], 0 // CHECK15-NEXT: br i1 [[TMP150]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]] // CHECK15: omp_offload.failed46: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163(i32 [[TMP116]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163(i32 [[TMP116]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT47]] // CHECK15: omp_offload.cont47: // CHECK15-NEXT: [[TMP151:%.*]] = load i32, ptr [[M]], align 4 @@ -7360,7 +7325,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP191:%.*]] = icmp ne i32 [[TMP190]], 0 // CHECK15-NEXT: br i1 [[TMP191]], label [[OMP_OFFLOAD_FAILED62:%.*]], label [[OMP_OFFLOAD_CONT63:%.*]] // CHECK15: omp_offload.failed62: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168(i32 [[TMP152]], i32 [[TMP154]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168(i32 [[TMP152]], i32 [[TMP154]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK15-NEXT: br label 
[[OMP_OFFLOAD_CONT63]] // CHECK15: omp_offload.cont63: // CHECK15-NEXT: [[TMP192:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -7388,7 +7353,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -7477,7 +7442,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -7594,7 +7559,7 @@ int 
main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -7683,7 +7648,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -7809,7 +7774,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158.omp_outlined -// CHECK15-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -7929,7 +7894,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -8048,7 +8013,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163.omp_outlined -// CHECK15-SAME: (ptr noalias 
noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -8137,7 +8102,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -8259,7 +8224,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], 
ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -8354,7 +8319,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -8455,7 +8420,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK15-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK15-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK15-NEXT: 
[[A:%.*]] = alloca [10 x i32], align 4 @@ -8527,7 +8492,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK15-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK15: omp_offload.failed: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116(ptr [[A]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116(ptr [[A]]) #[[ATTR3]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK15: omp_offload.cont: // CHECK15-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -8568,7 +8533,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK15-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK15: omp_offload.failed6: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121(ptr [[A]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121(ptr [[A]]) #[[ATTR3]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK15: omp_offload.cont7: // CHECK15-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -8618,7 +8583,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK15-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK15: omp_offload.failed13: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126(i32 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126(i32 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK15: omp_offload.cont14: // CHECK15-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1 x ptr], ptr 
[[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0 @@ -8659,7 +8624,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CHECK15-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED20:%.*]], label [[OMP_OFFLOAD_CONT21:%.*]] // CHECK15: omp_offload.failed20: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131(ptr [[A]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131(ptr [[A]]) #[[ATTR3]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT21]] // CHECK15: omp_offload.cont21: // CHECK15-NEXT: [[TMP85:%.*]] = load i32, ptr [[M]], align 4 @@ -8709,7 +8674,7 @@ int main (int argc, char **argv) { // CHECK15-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK15-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED28:%.*]], label [[OMP_OFFLOAD_CONT29:%.*]] // CHECK15: omp_offload.failed28: -// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136(i32 [[TMP86]], ptr [[A]]) #[[ATTR4]] +// CHECK15-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136(i32 [[TMP86]], ptr [[A]]) #[[ATTR3]] // CHECK15-NEXT: br label [[OMP_OFFLOAD_CONT29]] // CHECK15: omp_offload.cont29: // CHECK15-NEXT: ret i32 0 @@ -8726,7 +8691,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -8787,7 
+8752,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -8871,7 +8836,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -8932,7 +8897,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -9025,7 +8990,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -9092,7 +9057,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca 
ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -9198,7 +9163,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -9259,7 +9224,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -9350,7 +9315,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) 
#[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -9417,7 +9382,7 @@ int main (int argc, char **argv) { // // // CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136.omp_outlined.omp_outlined -// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -9491,13 +9456,6 @@ int main (int argc, char **argv) { // CHECK15-NEXT: ret void // // -// CHECK15-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK15-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK15-NEXT: entry: -// CHECK15-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK15-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@main // CHECK17-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -9634,7 +9592,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK17-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label 
[[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK17: omp_offload.cont: // CHECK17-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -9705,7 +9663,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP72:%.*]] = icmp ne i32 [[TMP71]], 0 // CHECK17-NEXT: br i1 [[TMP72]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK17: omp_offload.failed16: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK17: omp_offload.cont17: // CHECK17-NEXT: [[TMP73:%.*]] = load i32, ptr [[M]], align 4 @@ -9785,7 +9743,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP112:%.*]] = icmp ne i32 [[TMP111]], 0 // CHECK17-NEXT: br i1 [[TMP112]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK17: omp_offload.failed31: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK17: omp_offload.cont32: // CHECK17-NEXT: [[TMP113:%.*]] = load i32, ptr [[N]], align 4 @@ -9856,7 +9814,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP147:%.*]] = icmp ne i32 [[TMP146]], 0 // CHECK17-NEXT: br i1 [[TMP147]], label 
[[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]] // CHECK17: omp_offload.failed46: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163(i64 [[TMP114]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163(i64 [[TMP114]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT47]] // CHECK17: omp_offload.cont47: // CHECK17-NEXT: [[TMP148:%.*]] = load i32, ptr [[M]], align 4 @@ -9936,7 +9894,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP187:%.*]] = icmp ne i32 [[TMP186]], 0 // CHECK17-NEXT: br i1 [[TMP187]], label [[OMP_OFFLOAD_FAILED62:%.*]], label [[OMP_OFFLOAD_CONT63:%.*]] // CHECK17: omp_offload.failed62: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168(i64 [[TMP149]], i64 [[TMP151]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168(i64 [[TMP149]], i64 [[TMP151]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT63]] // CHECK17: omp_offload.cont63: // CHECK17-NEXT: [[TMP188:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -9964,7 +9922,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // 
CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10055,7 +10013,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10175,7 +10133,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10266,7 +10224,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10395,7 +10353,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10517,7 +10475,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10639,7 +10597,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10730,7 +10688,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10855,7 +10813,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10952,7 +10910,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11056,7 +11014,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK17-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK17-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK17-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -11128,7 +11086,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK17-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116(ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116(ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK17: omp_offload.cont: // CHECK17-NEXT: [[TMP20:%.*]] = 
getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -11169,7 +11127,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK17-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK17: omp_offload.failed6: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121(ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121(ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK17: omp_offload.cont7: // CHECK17-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -11219,7 +11177,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK17-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK17: omp_offload.failed13: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126(i64 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126(i64 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK17: omp_offload.cont14: // CHECK17-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0 @@ -11260,7 +11218,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CHECK17-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED20:%.*]], label [[OMP_OFFLOAD_CONT21:%.*]] // CHECK17: omp_offload.failed20: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131(ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131(ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT21]] // CHECK17: omp_offload.cont21: // CHECK17-NEXT: 
[[TMP85:%.*]] = load i32, ptr [[M]], align 4 @@ -11310,7 +11268,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK17-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED28:%.*]], label [[OMP_OFFLOAD_CONT29:%.*]] // CHECK17: omp_offload.failed28: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136(i64 [[TMP86]], ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136(i64 [[TMP86]], ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT29]] // CHECK17: omp_offload.cont29: // CHECK17-NEXT: ret i32 0 @@ -11327,7 +11285,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11390,7 +11348,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr 
noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11477,7 +11435,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11540,7 +11498,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11636,7 +11594,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126.omp_outlined -// CHECK17-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11705,7 +11663,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11816,7 +11774,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) 
[[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11879,7 +11837,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -11973,7 +11931,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -12042,7 +12000,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136.omp_outlined.omp_outlined -// 
CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -12119,13 +12077,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@main // CHECK19-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -12262,7 +12213,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK19-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -12334,7 +12285,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP73:%.*]] = icmp ne i32 [[TMP72]], 0 // 
CHECK19-NEXT: br i1 [[TMP73]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK19: omp_offload.failed16: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK19: omp_offload.cont17: // CHECK19-NEXT: [[TMP74:%.*]] = load i32, ptr [[M]], align 4 @@ -12415,7 +12366,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP114:%.*]] = icmp ne i32 [[TMP113]], 0 // CHECK19-NEXT: br i1 [[TMP114]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK19: omp_offload.failed31: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK19: omp_offload.cont32: // CHECK19-NEXT: [[TMP115:%.*]] = load i32, ptr [[N]], align 4 @@ -12487,7 +12438,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP150:%.*]] = icmp ne i32 [[TMP149]], 0 // CHECK19-NEXT: br i1 [[TMP150]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]] // CHECK19: omp_offload.failed46: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163(i32 [[TMP116]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163(i32 [[TMP116]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT47]] // CHECK19: omp_offload.cont47: // CHECK19-NEXT: [[TMP151:%.*]] = load i32, ptr [[M]], align 4 @@ -12568,7 +12519,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP191:%.*]] 
= icmp ne i32 [[TMP190]], 0 // CHECK19-NEXT: br i1 [[TMP191]], label [[OMP_OFFLOAD_FAILED62:%.*]], label [[OMP_OFFLOAD_CONT63:%.*]] // CHECK19: omp_offload.failed62: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168(i32 [[TMP152]], i32 [[TMP154]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168(i32 [[TMP152]], i32 [[TMP154]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT63]] // CHECK19: omp_offload.cont63: // CHECK19-NEXT: [[TMP192:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -12596,7 +12547,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12685,7 +12636,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l148.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12802,7 +12753,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12891,7 +12842,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l153.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], 
ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13017,7 +12968,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13137,7 +13088,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l158.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13256,7 +13207,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13345,7 +13296,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l163.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) 
#[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13467,7 +13418,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13562,7 +13513,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l168.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) 
#[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13663,7 +13614,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK19-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK19-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK19-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -13735,7 +13686,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK19-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116(ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116(ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -13776,7 +13727,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK19-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK19: omp_offload.failed6: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121(ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121(ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK19: omp_offload.cont7: // CHECK19-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -13826,7 +13777,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK19-NEXT: 
br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK19: omp_offload.failed13: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126(i32 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126(i32 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK19: omp_offload.cont14: // CHECK19-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0 @@ -13867,7 +13818,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CHECK19-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED20:%.*]], label [[OMP_OFFLOAD_CONT21:%.*]] // CHECK19: omp_offload.failed20: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131(ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131(ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT21]] // CHECK19: omp_offload.cont21: // CHECK19-NEXT: [[TMP85:%.*]] = load i32, ptr [[M]], align 4 @@ -13917,7 +13868,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK19-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED28:%.*]], label [[OMP_OFFLOAD_CONT29:%.*]] // CHECK19: omp_offload.failed28: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136(i32 [[TMP86]], ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136(i32 [[TMP86]], ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT29]] // CHECK19: omp_offload.cont29: // CHECK19-NEXT: ret i32 0 @@ -13934,7 +13885,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13995,7 +13946,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l116.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14079,7 +14030,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // 
CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14140,7 +14091,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l121.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14233,7 +14184,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14300,7 +14251,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l126.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias 
noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14406,7 +14357,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14467,7 +14418,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l131.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef 
[[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14558,7 +14509,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14625,7 +14576,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l136.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14698,10 +14649,3 @@ int main (int argc, 
char **argv) { // CHECK19: omp.dispatch.end: // CHECK19-NEXT: ret void // -// -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp index f35f650792d38..96439c053ea89 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp @@ -602,7 +602,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l36 -// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]], i64 noundef [[I:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]], i64 noundef [[I:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i64, align 8 @@ -852,13 +852,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_argument_globali // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1275,7 +1268,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l36 -// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]], i32 noundef [[I:%.*]], i32 noundef [[N:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (ptr 
noundef nonnull align 4 dereferenceable(400) [[A:%.*]], i32 noundef [[I:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -1520,13 +1513,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_argument_globali // CHECK5-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1896,7 +1882,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 // CHECK9-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l74(i64 [[TMP1]], ptr [[VLA]], i64 [[TMP4]], i64 [[TMP6]]) #[[ATTR5:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l74(i64 [[TMP1]], ptr [[VLA]], i64 [[TMP4]], i64 [[TMP6]]) #[[ATTR4:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 0 @@ -1924,7 +1910,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l74.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2031,7 +2017,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l74.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2164,13 +2150,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -2272,7 
+2251,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 // CHECK11-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l74(i32 [[TMP0]], ptr [[VLA]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l74(i32 [[TMP0]], ptr [[VLA]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i32 0 @@ -2300,7 +2279,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l74.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2405,7 +2384,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l74.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef 
[[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2535,13 +2514,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -2775,7 +2747,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 // CHECK17-NEXT: br i1 [[TMP24]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l112(ptr [[THIS1]], i64 [[TMP1]]) #[[ATTR4:[0-9]+]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l112(ptr [[THIS1]], i64 [[TMP1]]) #[[ATTR3:[0-9]+]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK17: omp_offload.cont: // CHECK17-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], 
i32 0, i32 0 @@ -2797,7 +2769,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l112.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2873,7 +2845,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l112.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2974,13 +2946,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // 
CHECK19-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK19-SAME: () #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -3050,7 +3015,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 // CHECK19-NEXT: br i1 [[TMP24]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l112(ptr [[THIS1]], i32 [[TMP1]]) #[[ATTR4:[0-9]+]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l112(ptr [[THIS1]], i32 [[TMP1]]) #[[ATTR3:[0-9]+]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3072,7 +3037,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l112.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3146,7 +3111,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l112.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull 
align 4 dereferenceable(4) [[I:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3244,13 +3209,6 @@ int main (int argc, char **argv) { // CHECK19-NEXT: ret void // // -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// -// // CHECK21-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK21-SAME: () #[[ATTR0:[0-9]+]] { // CHECK21-NEXT: entry: @@ -3483,7 +3441,7 @@ int main (int argc, char **argv) { // CHECK25-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 // CHECK25-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK25: omp_offload.failed: -// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166(i64 [[TMP1]], ptr [[VLA]], i64 [[TMP4]], i64 [[TMP6]]) #[[ATTR5:[0-9]+]] +// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166(i64 [[TMP1]], ptr [[VLA]], i64 [[TMP4]], i64 [[TMP6]]) #[[ATTR4:[0-9]+]] // CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK25: omp_offload.cont: // CHECK25-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -3513,7 +3471,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull 
align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3620,7 +3578,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166.omp_outlined.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3754,7 +3712,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR8:[0-9]+]] comdat { +// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) 
#[[ATTR7:[0-9]+]] comdat { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK25-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -3830,14 +3788,14 @@ int main (int argc, char **argv) { // CHECK25-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK25-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK25: omp_offload.failed: -// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR5]] +// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR4]] // CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK25: omp_offload.cont: // CHECK25-NEXT: ret i32 0 // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155 -// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[TE_ADDR:%.*]] = alloca i64, align 8 // CHECK25-NEXT: [[TH_ADDR:%.*]] = alloca i64, align 8 @@ -3855,7 +3813,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = 
alloca ptr, align 8 @@ -3925,7 +3883,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155.omp_outlined.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4008,13 +3966,6 @@ int main (int argc, char **argv) { // CHECK25-NEXT: ret void // // -// CHECK25-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK25-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK25-NEXT: entry: -// CHECK25-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK25-NEXT: ret void -// -// // CHECK27-LABEL: define {{[^@]+}}@main // CHECK27-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK27-NEXT: entry: @@ -4122,7 +4073,7 @@ int main (int argc, char **argv) { // CHECK27-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 // CHECK27-NEXT: br i1 [[TMP42]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK27: omp_offload.failed: -// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166(i32 [[TMP0]], ptr [[VLA]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166(i32 [[TMP0]], ptr [[VLA]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK27: omp_offload.cont: // CHECK27-NEXT: 
[[TMP43:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -4152,7 +4103,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4257,7 +4208,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l166.omp_outlined.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[I:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: 
// CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4388,7 +4339,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK27-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR8:[0-9]+]] comdat { +// CHECK27-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR7:[0-9]+]] comdat { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK27-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -4464,14 +4415,14 @@ int main (int argc, char **argv) { // CHECK27-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK27-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK27: omp_offload.failed: -// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR5]] +// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR4]] // CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK27: omp_offload.cont: // CHECK27-NEXT: ret i32 0 // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155 -// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[TE_ADDR:%.*]] = alloca i32, align 4 // CHECK27-NEXT: [[TH_ADDR:%.*]] = alloca i32, align 4 @@ -4489,7 +4440,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull 
align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4557,7 +4508,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l155.omp_outlined.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4637,13 +4588,6 @@ int main (int argc, char **argv) { // CHECK27-NEXT: ret void // // -// CHECK27-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK27-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK27-NEXT: entry: -// CHECK27-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK27-NEXT: ret void -// -// // CHECK29-LABEL: define {{[^@]+}}@main // CHECK29-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK29-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp index 518a6dc6d8e23..e68bd591519b7 100644 --- 
a/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp @@ -166,7 +166,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -187,7 +187,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -260,7 +260,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef 
[[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -359,13 +359,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -425,7 +418,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -446,7 +439,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ 
-517,7 +510,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -612,13 +605,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -888,7 +874,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CHECK9-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l86(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l86(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP51:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -921,7 +907,7 @@ int main (int argc, char **argv) { // // // 
CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l86.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1051,7 +1037,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l86.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1225,7 +1211,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -1274,7 +1260,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l72(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l72(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: ret i32 0 @@ -1291,7 +1277,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l72.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1364,7 +1350,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l72.omp_outlined.omp_outlined -// 
CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1462,13 +1448,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1597,7 +1576,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 // CHECK11-NEXT: br i1 [[TMP49]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l86(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l86(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1630,7 +1609,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l86.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1762,7 +1741,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l86.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1936,7 +1915,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -1985,7 +1964,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l72(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l72(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: ret i32 0 @@ -2002,7 +1981,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l72.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2073,7 +2052,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l72.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2167,13 +2146,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp index ab483ce94b6f7..c2226bc9e75d7 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp @@ -214,7 +214,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) 
#[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -256,7 +256,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -298,7 +298,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK1-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK1: omp_offload.failed16: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK1: omp_offload.cont17: // CHECK1-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -318,7 +318,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] 
{ +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -388,7 +388,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -483,7 +483,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -553,7 +553,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef 
[[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -648,7 +648,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -738,7 +738,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -822,13 +822,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // 
// -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -897,7 +890,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -939,7 +932,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK3-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK3: omp_offload.failed8: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK3: omp_offload.cont9: // CHECK3-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -981,7 +974,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK3-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK3: omp_offload.failed16: -// CHECK3-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK3: omp_offload.cont17: // CHECK3-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1001,7 +994,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1069,7 +1062,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1161,7 +1154,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1229,7 +1222,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1321,7 +1314,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1409,7 +1402,7 @@ int main (int argc, char 
**argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1490,13 +1483,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1861,7 +1847,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK9-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l108(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l108(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -1932,7 +1918,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP72:%.*]] = icmp ne i32 [[TMP71]], 0 // CHECK9-NEXT: br i1 [[TMP72]], label 
[[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK9: omp_offload.failed16: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l113(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l113(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK9: omp_offload.cont17: // CHECK9-NEXT: [[TMP73:%.*]] = load i32, ptr [[M]], align 4 @@ -2012,7 +1998,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP112:%.*]] = icmp ne i32 [[TMP111]], 0 // CHECK9-NEXT: br i1 [[TMP112]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK9: omp_offload.failed31: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l118(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l118(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK9: omp_offload.cont32: // CHECK9-NEXT: [[TMP113:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -2040,7 +2026,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l108.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2143,7 +2129,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l108.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2275,7 +2261,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l113.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2378,7 +2364,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l113.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2519,7 +2505,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l118.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2653,7 +2639,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l118.omp_outlined.omp_outlined -// 
CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2772,7 +2758,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -2833,7 +2819,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l86(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l86(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -2874,7 +2860,7 @@ int main (int argc, 
char **argv) { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK9: omp_offload.failed6: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l91(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l91(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK9: omp_offload.cont7: // CHECK9-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -2924,7 +2910,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK9-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK9: omp_offload.failed13: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l96(i64 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l96(i64 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK9: omp_offload.cont14: // CHECK9-NEXT: ret i32 0 @@ -2941,7 +2927,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l86.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3011,7 +2997,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l86.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3105,7 +3091,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l91.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3175,7 +3161,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l91.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef 
[[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3278,7 +3264,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l96.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3375,7 +3361,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l96.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3460,13 +3446,6 @@ int main 
(int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -3584,7 +3563,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK11-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l108(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l108(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -3656,7 +3635,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP73:%.*]] = icmp ne i32 [[TMP72]], 0 // CHECK11-NEXT: br i1 [[TMP73]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK11: omp_offload.failed16: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l113(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l113(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK11: omp_offload.cont17: // CHECK11-NEXT: [[TMP74:%.*]] = load i32, ptr [[M]], align 4 @@ -3737,7 +3716,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP114:%.*]] = icmp ne i32 [[TMP113]], 0 // CHECK11-NEXT: br i1 [[TMP114]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK11: 
omp_offload.failed31: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l118(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l118(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK11: omp_offload.cont32: // CHECK11-NEXT: [[TMP115:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -3765,7 +3744,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l108.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3866,7 +3845,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l108.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef 
[[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3995,7 +3974,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l113.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4096,7 +4075,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l113.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) 
[[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4234,7 +4213,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l118.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4366,7 +4345,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l118.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef 
[[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4482,7 +4461,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -4543,7 +4522,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l86(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l86(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -4584,7 +4563,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK11-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK11: omp_offload.failed6: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l91(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l91(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK11: omp_offload.cont7: // CHECK11-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -4634,7 +4613,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // 
CHECK11-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK11: omp_offload.failed13: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l96(i32 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l96(i32 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK11: omp_offload.cont14: // CHECK11-NEXT: ret i32 0 @@ -4651,7 +4630,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l86.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4719,7 +4698,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l86.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4810,7 +4789,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l91.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4878,7 +4857,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l91.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4978,7 +4957,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l96.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: 
(ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -5073,7 +5052,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l96.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -5155,13 +5134,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp index c89fbc2b42f50..4c9299e6cb9d2 100644 --- 
a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp @@ -408,7 +408,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l101.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -570,7 +570,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l101.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef 
[[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -712,7 +712,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -911,7 +911,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1053,7 +1053,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) 
[[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1271,13 +1271,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1507,7 +1500,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l101.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: 
entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1667,7 +1660,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l101.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1805,7 +1798,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2004,7 +1997,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 
dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2144,7 +2137,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2358,13 +2351,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // 
CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2492,7 +2478,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK5-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK5-SAME: () #[[ATTR1]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2815,7 +2801,7 @@ int main() { // // // CHECK7-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK7-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK7-SAME: () #[[ATTR1]] comdat { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK7-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -3135,7 +3121,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75 -// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[SIVAR:%.*]], i64 noundef [[G1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[SIVAR:%.*]], i64 noundef [[G1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8 @@ -3163,7 +3149,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3251,7 +3237,7 @@ int main() { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR6]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3357,13 +3343,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp index 6236b7ee57e99..275058b195e37 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp @@ -147,7 +147,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43() #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43() #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // 
CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -180,7 +180,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret void @@ -194,7 +194,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -261,7 +261,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -345,7 +345,7 @@ int main() { // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -393,7 +393,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP20]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP20]] // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP20]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP20]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP20]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -417,7 +417,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 
noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -495,7 +495,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@main -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR3:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 @@ -540,7 +540,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -573,7 +573,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr @Arg, align 4 @@ -625,7 +625,7 @@ int main() { // CHECK1-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR3]] +// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 @@ -641,7 +641,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -708,7 +708,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -793,7 +793,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -841,7 +841,7 @@ int main() { // 
CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP32]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP32]] // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP32]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP32]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP32]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP32]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -865,7 +865,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -963,7 +963,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1020,7 +1020,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP38]] // CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP38]] // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP38]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP38]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP38]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP38]] // CHECK1-NEXT: br label [[OMP_IF_END]] // CHECK1: omp_if.end: @@ -1046,7 +1046,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1169,7 +1169,7 @@ int main() { // 
CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -1202,7 +1202,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 @@ -1254,7 +1254,7 @@ int main() { // CHECK1-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK1-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: ret i32 0 @@ -1268,7 +1268,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1335,7 +1335,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1420,7 +1420,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1468,7 +1468,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP50]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP50]] // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP50]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined(ptr [[TMP11]], ptr 
[[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP50]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP50]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP50]] // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -1492,7 +1492,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1590,7 +1590,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1647,7 +1647,7 @@ int main() { // CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP56]] 
// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP56]] // CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP56]] -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP56]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP56]] // CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP56]] // CHECK1-NEXT: br label [[OMP_IF_END]] // CHECK1: omp_if.end: @@ -1673,7 +1673,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1750,13 +1750,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1794,7 +1787,7 @@ int main() { // 
CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43() #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43() #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -1827,7 +1820,7 @@ int main() { // CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK3-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK3: omp_offload.failed3: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47() #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47() #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK3: omp_offload.cont4: // CHECK3-NEXT: ret void @@ -1841,7 +1834,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1908,7 +1901,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef 
[[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1992,7 +1985,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2040,7 +2033,7 @@ int main() { // CHECK3-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP20]] // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP20]] // CHECK3-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP20]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP20]] // CHECK3-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP20]] // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -2064,7 +2057,7 @@ int main() { // // // 
CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2142,7 +2135,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@main -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { +// CHECK3-SAME: () #[[ATTR3:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 @@ -2187,7 +2180,7 @@ int main() { // CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -2220,7 +2213,7 @@ int main() { // CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK3-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK3: omp_offload.failed3: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84() #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84() #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT4]] // 
CHECK3: omp_offload.cont4: // CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr @Arg, align 4 @@ -2272,7 +2265,7 @@ int main() { // CHECK3-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK3-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK3: omp_offload.failed8: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK3: omp_offload.cont9: // CHECK3-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 @@ -2288,7 +2281,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2355,7 +2348,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2440,7 +2433,7 @@ int main() { // // // CHECK3-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2488,7 +2481,7 @@ int main() { // CHECK3-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK3-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -2512,7 +2505,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2610,7 +2603,7 @@ int main() { // // 
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2679,7 +2672,7 @@ int main() { // CHECK3-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP35]] -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined(ptr [[TMP15]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP35]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined(ptr [[TMP15]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP35]] // CHECK3-NEXT: br label [[OMP_IF_END]] // CHECK3: omp_if.end: @@ -2719,7 +2712,7 @@ int main() { // CHECK3-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK3-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR15]], align 4 -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1(ptr [[TMP27]], ptr 
[[DOTBOUND_ZERO_ADDR15]], i64 [[TMP21]], i64 [[TMP23]], i64 [[TMP25]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1(ptr [[TMP27]], ptr [[DOTBOUND_ZERO_ADDR15]], i64 [[TMP21]], i64 [[TMP23]], i64 [[TMP25]]) #[[ATTR2]] // CHECK3-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK3-NEXT: br label [[OMP_IF_END16]] // CHECK3: omp_if.end16: @@ -2747,7 +2740,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2874,7 +2867,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1 -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3046,7 +3039,7 @@ 
int main() { // CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57() #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57() #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -3079,7 +3072,7 @@ int main() { // CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK3-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK3: omp_offload.failed3: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK3: omp_offload.cont4: // CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 @@ -3131,7 +3124,7 @@ int main() { // CHECK3-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK3-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK3: omp_offload.failed8: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK3: omp_offload.cont9: // CHECK3-NEXT: ret i32 0 @@ -3145,7 +3138,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// 
CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3212,7 +3205,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3297,7 +3290,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3345,7 +3338,7 @@ int main() { // CHECK3-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK3-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK3-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK3-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -3369,7 +3362,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3467,7 +3460,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3524,7 +3517,7 @@ int main() { // CHECK3-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP55]] // CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP55]] // CHECK3-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, 
!llvm.access.group [[ACC_GRP55]] -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP55]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP55]] // CHECK3-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP55]] // CHECK3-NEXT: br label [[OMP_IF_END]] // CHECK3: omp_if.end: @@ -3550,7 +3543,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3627,13 +3620,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -4262,7 +4248,7 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: 
-// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43() #[[ATTR3:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43() #[[ATTR2:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -4295,7 +4281,7 @@ int main() { // CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK9-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47() #[[ATTR3]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47() #[[ATTR2]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: ret void @@ -4309,7 +4295,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4376,7 +4362,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef 
[[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4460,7 +4446,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4508,7 +4494,7 @@ int main() { // CHECK9-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP20]] // CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP20]] // CHECK9-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP20]] // CHECK9-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP20]] // CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK9: omp.inner.for.inc: @@ -4532,7 +4518,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef 
[[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4610,7 +4596,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@main -// CHECK9-SAME: () #[[ATTR4:[0-9]+]] { +// CHECK9-SAME: () #[[ATTR3:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 @@ -4655,7 +4641,7 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR3]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR2]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -4688,7 +4674,7 @@ int main() { // CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK9-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84() #[[ATTR3]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84() #[[ATTR2]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: [[TMP30:%.*]] = load i32, ptr @Arg, align 4 @@ -4740,7 +4726,7 @@ int main() { // CHECK9-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK9-NEXT: br 
i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK9: omp_offload.failed8: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR3]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR2]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK9: omp_offload.cont9: // CHECK9-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 @@ -4756,7 +4742,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4823,7 +4809,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4908,7 +4894,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr 
noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -4956,7 +4942,7 @@ int main() { // CHECK9-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP32]] // CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP32]] // CHECK9-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP32]] -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP32]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP32]] // CHECK9-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP32]] // CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK9: omp.inner.for.inc: @@ -4980,7 +4966,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5078,7 +5064,7 @@ int main() { // // // CHECK9-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5135,7 +5121,7 @@ int main() { // CHECK9-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP38]] // CHECK9-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP38]] // CHECK9-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP38]] -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP38]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP38]] // CHECK9-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP38]] // CHECK9-NEXT: br label [[OMP_IF_END]] // CHECK9: omp_if.end: @@ -5161,7 +5147,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef 
[[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5284,7 +5270,7 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57() #[[ATTR3]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57() #[[ATTR2]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -5317,7 +5303,7 @@ int main() { // CHECK9-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK9-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK9: omp_offload.failed3: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR3]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR2]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK9: omp_offload.cont4: // CHECK9-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 @@ -5369,7 +5355,7 @@ int main() { // CHECK9-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK9-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK9: omp_offload.failed8: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR3]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR2]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK9: omp_offload.cont9: // CHECK9-NEXT: ret i32 0 @@ -5383,7 +5369,7 @@ 
int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5450,7 +5436,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5535,7 +5521,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5583,7 +5569,7 @@ int main() { // CHECK9-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP50]] // CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, 
!llvm.access.group [[ACC_GRP50]] // CHECK9-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP50]] -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP50]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP50]] // CHECK9-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP50]] // CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK9: omp.inner.for.inc: @@ -5607,7 +5593,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5705,7 +5691,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5762,7 +5748,7 @@ int main() { // CHECK9-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP56]] // CHECK9-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP56]] // CHECK9-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP56]] -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP56]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP56]] // CHECK9-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP56]] // CHECK9-NEXT: br label [[OMP_IF_END]] // CHECK9: omp_if.end: @@ -5788,7 +5774,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5865,13 +5851,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { 
-// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -5909,7 +5888,7 @@ int main() { // CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK11-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43() #[[ATTR3:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43() #[[ATTR2:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -5942,7 +5921,7 @@ int main() { // CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK11-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK11: omp_offload.failed3: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47() #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47() #[[ATTR2]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK11: omp_offload.cont4: // CHECK11-NEXT: ret void @@ -5956,7 +5935,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6023,7 +6002,7 @@ int main() { // // 
// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l43.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6107,7 +6086,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6155,7 +6134,7 @@ int main() { // CHECK11-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP20]] // CHECK11-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP20]] // CHECK11-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP20]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) 
#[[ATTR2]], !llvm.access.group [[ACC_GRP20]] // CHECK11-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP20]] // CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK11: omp.inner.for.inc: @@ -6179,7 +6158,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l47.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6257,7 +6236,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@main -// CHECK11-SAME: () #[[ATTR4:[0-9]+]] { +// CHECK11-SAME: () #[[ATTR3:[0-9]+]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 @@ -6302,7 +6281,7 @@ int main() { // CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK11-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR2]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -6335,7 +6314,7 @@ int main() { // CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK11-NEXT: br i1 [[TMP29]], label 
[[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK11: omp_offload.failed3: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84() #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84() #[[ATTR2]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK11: omp_offload.cont4: // CHECK11-NEXT: [[TMP30:%.*]] = load i32, ptr @Arg, align 4 @@ -6387,7 +6366,7 @@ int main() { // CHECK11-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK11-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK11: omp_offload.failed8: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92(i64 [[TMP31]]) #[[ATTR2]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK11: omp_offload.cont9: // CHECK11-NEXT: [[TMP56:%.*]] = load i32, ptr @Arg, align 4 @@ -6403,7 +6382,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6470,7 +6449,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef 
[[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6555,7 +6534,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6603,7 +6582,7 @@ int main() { // CHECK11-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK11-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK11-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK11: omp.inner.for.inc: @@ -6627,7 +6606,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l84.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias 
noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6725,7 +6704,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6794,7 +6773,7 @@ int main() { // CHECK11-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP35]] -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined(ptr [[TMP15]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP35]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined(ptr [[TMP15]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP35]] // CHECK11-NEXT: br label [[OMP_IF_END]] // CHECK11: omp_if.end: @@ -6834,7 +6813,7 @@ int main() { // CHECK11-NEXT: call void 
@__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK11-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR15]], align 4 -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1(ptr [[TMP27]], ptr [[DOTBOUND_ZERO_ADDR15]], i64 [[TMP21]], i64 [[TMP23]], i64 [[TMP25]]) #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1(ptr [[TMP27]], ptr [[DOTBOUND_ZERO_ADDR15]], i64 [[TMP21]], i64 [[TMP23]], i64 [[TMP25]]) #[[ATTR2]] // CHECK11-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK11-NEXT: br label [[OMP_IF_END16]] // CHECK11: omp_if.end16: @@ -6862,7 +6841,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6989,7 +6968,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l92.omp_outlined.omp_outlined.1 -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], 
ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7161,7 +7140,7 @@ int main() { // CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK11-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57() #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57() #[[ATTR2]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -7194,7 +7173,7 @@ int main() { // CHECK11-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK11-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK11: omp_offload.failed3: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62() #[[ATTR2]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK11: omp_offload.cont4: // CHECK11-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARG_ADDR]], align 4 @@ -7246,7 +7225,7 @@ int main() { // CHECK11-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK11-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK11: omp_offload.failed8: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67(i64 [[TMP31]]) #[[ATTR2]] // 
CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK11: omp_offload.cont9: // CHECK11-NEXT: ret i32 0 @@ -7260,7 +7239,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7327,7 +7306,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l57.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7412,7 +7391,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7460,7 +7439,7 @@ int main() { // CHECK11-NEXT: call void @__kmpc_serialized_parallel(ptr 
@[[GLOB3]], i32 [[TMP1]]) // CHECK11-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK11-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4 -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]] // CHECK11-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]) // CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK11: omp.inner.for.inc: @@ -7484,7 +7463,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l62.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7582,7 +7561,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 
// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7639,7 +7618,7 @@ int main() { // CHECK11-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP55]] // CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !llvm.access.group [[ACC_GRP55]] // CHECK11-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4, !llvm.access.group [[ACC_GRP55]] -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP55]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]], !llvm.access.group [[ACC_GRP55]] // CHECK11-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP55]] // CHECK11-NEXT: br label [[OMP_IF_END]] // CHECK11: omp_if.end: @@ -7665,7 +7644,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l67.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7742,13 +7721,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// 
CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@_Z9gtid_testv // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp index 384091135b0e5..fe2001842c268 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp @@ -221,7 +221,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -325,7 +325,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull 
align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -453,13 +453,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -508,7 +501,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) 
#[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -610,7 +603,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -736,13 +729,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -882,7 +868,7 @@ int main() { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i64 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i64 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -893,11 +879,11 @@ int main() { // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP40]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done2: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP41]] // @@ -949,7 +935,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 
dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1079,14 +1065,14 @@ int main() { // CHECK9-NEXT: store i32 [[TMP29]], ptr [[TMP4]], align 4 // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN11:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN11]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP30]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: 
[[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN11]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE12:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done12: @@ -1094,7 +1080,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1243,14 +1229,14 @@ int main() { // CHECK9-NEXT: store i32 [[TMP30]], ptr [[TMP4]], align 4 // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN15:%.*]] = getelementptr inbounds [2 x 
%struct.S], ptr [[S_ARR5]], i32 0, i32 0 // CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN15]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP31]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN15]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE16:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done16: @@ -1263,12 +1249,12 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1355,7 +1341,7 @@ int main() { // CHECK9-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 // CHECK9-NEXT: br i1 [[TMP34]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -1365,11 +1351,11 @@ int main() { // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP35]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done2: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP36:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP36]] // @@ -1453,7 +1439,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], 
ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1577,14 +1563,14 @@ int main() { // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP4]], ptr align 4 [[TMP27]], i64 4, i1 false) // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN10:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN10]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP28]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN10]] // CHECK9-NEXT: br i1 
[[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE11:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done11: @@ -1592,7 +1578,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1735,14 +1721,14 @@ int main() { // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 [[TMP28]], i64 4, i1 false) // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN14:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR5]], i32 0, i32 0 // CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN14]], i64 2 // CHECK9-NEXT: br label 
[[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP29]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN14]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done15: @@ -1755,7 +1741,7 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // @@ -1793,13 +1779,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1903,7 +1882,7 @@ int main() { // CHECK11-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK11-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i32 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i32 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -1914,11 +1893,11 @@ int main() { // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP40]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done2: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP41]] // @@ -1970,7 +1949,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull 
align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2098,14 +2077,14 @@ int main() { // CHECK11-NEXT: store i32 [[TMP27]], ptr [[TMP4]], align 4 // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN11:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN11]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP28]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // 
CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN11]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE12:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done12: @@ -2113,7 +2092,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2258,14 +2237,14 @@ int main() { // CHECK11-NEXT: store i32 [[TMP30]], ptr [[TMP4]], align 4 // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN13:%.*]] = 
getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN13]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP31]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done14: @@ -2278,12 +2257,12 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2370,7 +2349,7 @@ int main() { // CHECK11-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 // CHECK11-NEXT: br i1 [[TMP34]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: 
omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -2380,11 +2359,11 @@ int main() { // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP35]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done2: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP36:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP36]] // @@ -2468,7 +2447,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) 
[[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2590,14 +2569,14 @@ int main() { // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP4]], ptr align 4 [[TMP25]], i32 4, i1 false) // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN10:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN10]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP26]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: 
[[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN10]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE11:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done11: @@ -2605,7 +2584,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2744,14 +2723,14 @@ int main() { // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP6]], ptr align 4 [[TMP28]], i32 4, i1 false) // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // 
CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN12]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP29]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done13: @@ -2764,7 +2743,7 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // @@ -2802,13 +2781,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -2920,14 +2892,14 @@ int main() { // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i64 4, i1 false) // 
CHECK13-NEXT: [[TMP17:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK13-NEXT: store i32 [[TMP17]], ptr @_ZZ4mainE4svar, align 4 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR5]], i32 0, i32 0 // CHECK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN13]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_ARRAYCPY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done14: @@ -2939,11 +2911,11 @@ int main() { // CHECK13: arraydestroy.body17: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP19]], [[ARRAYDESTROY_DONE14]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 
dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAY_BEGIN16]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17]] // CHECK13: arraydestroy.done21: -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP20:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP20]] // @@ -2977,12 +2949,12 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR1]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -3084,14 +3056,14 @@ int main() { // CHECK13: omp.arraycpy.done12: // CHECK13-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 8 // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i64 4, i1 false) -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR5]], i32 0, i32 0 // CHECK13-NEXT: 
[[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN13]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done14: @@ -3102,11 +3074,11 @@ int main() { // CHECK13: arraydestroy.body16: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST17:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE14]] ], [ [[ARRAYDESTROY_ELEMENT18:%.*]], [[ARRAYDESTROY_BODY16]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT18]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST17]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE19:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT18]], [[ARRAY_BEGIN15]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_DONE20:%.*]], label [[ARRAYDESTROY_BODY16]] // CHECK13: arraydestroy.done20: -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 
dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP19]] // @@ -3174,7 +3146,7 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // @@ -3321,14 +3293,14 @@ int main() { // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i32 4, i1 false) // CHECK15-NEXT: [[TMP17:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK15-NEXT: store i32 [[TMP17]], ptr @_ZZ4mainE4svar, align 4 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4:[0-9]+]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR3:[0-9]+]] // CHECK15-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR5]], i32 0, i32 0 // CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN12]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_ARRAYCPY_DONE11]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: 
[[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done13: @@ -3340,11 +3312,11 @@ int main() { // CHECK15: arraydestroy.body16: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST17:%.*]] = phi ptr [ [[TMP19]], [[ARRAYDESTROY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT18:%.*]], [[ARRAYDESTROY_BODY16]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT18]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST17]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE19:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT18]], [[ARRAY_BEGIN15]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_DONE20:%.*]], label [[ARRAYDESTROY_BODY16]] // CHECK15: arraydestroy.done20: -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP20:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP20]] // @@ -3378,12 +3350,12 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // // CHECK15-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK15-SAME: () #[[ATTR3:[0-9]+]] comdat { +// 
CHECK15-SAME: () #[[ATTR1]] comdat { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK15-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -3483,14 +3455,14 @@ int main() { // CHECK15: omp.arraycpy.done11: // CHECK15-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4 // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP5]], ptr align 4 [[TMP16]], i32 4, i1 false) -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR5]], i32 0, i32 0 // CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN12]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE11]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done13: @@ -3501,11 +3473,11 @@ int main() { // CHECK15: arraydestroy.body15: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT17]] = 
getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAY_BEGIN14]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15]] // CHECK15: arraydestroy.done19: -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP19]] // @@ -3573,7 +3545,7 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp index 7ff68628f10ef..49f57a000d3ff 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp @@ -95,7 +95,7 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK1-NEXT: [[CALL:%.*]] = invoke 
noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -128,16 +128,16 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50() #[[ATTR5:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50() #[[ATTR4:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: lpad: // CHECK1-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: cleanup +// CHECK1-NEXT: cleanup // CHECK1-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK1-NEXT: store ptr [[TMP16]], ptr [[EXN_SLOT]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK1-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] // CHECK1-NEXT: br label [[EH_RESUME:%.*]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -186,21 +186,21 @@ int main() { // CHECK1-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 // CHECK1-NEXT: br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55(i64 [[TMP19]]) #[[ATTR5]] +// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55(i64 [[TMP19]]) #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[TMP44:%.*]] = load i8, ptr [[A]], align 1 // CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP44]] to i32 // CHECK1-NEXT: [[CALL6:%.*]] = invoke noundef signext i32 @_Z5tmainIcLi5EEiv() -// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK1: invoke.cont5: // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK1-NEXT: [[CALL8:%.*]] = invoke noundef signext i32 @_Z5tmainI1SLi1EEiv() -// CHECK1-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK1-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK1: invoke.cont7: // CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK1-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK1-NEXT: ret i32 [[TMP45]] // CHECK1: eh.resume: @@ -244,7 +244,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -312,7 +312,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -367,7 +367,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -391,21 +391,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13:[0-9]+]], !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7:[0-9]+]], !llvm.access.group [[ACC_GRP13]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR7:[0-9]+]] comdat { -// CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5]] -// CHECK1-NEXT: call void @_ZSt9terminatev() #[[ATTR13]] +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]]) 
#[[ATTR6:[0-9]+]] comdat { +// CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] +// CHECK1-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55 -// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 @@ -421,7 +421,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -493,7 +493,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -548,7 +548,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: 
store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP21]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -572,9 +572,9 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP21]] // CHECK1-NEXT: unreachable // // @@ -615,7 +615,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36() #[[ATTR5]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36() #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -648,7 +648,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40() #[[ATTR5]] +// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40() #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret i32 0 @@ -693,14 +693,14 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36() #[[ATTR5]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36() #[[ATTR4]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP16:%.*]] = zext i8 [[TMP15]] to i32 @@ -735,30 +735,30 @@ int main() { // CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 // CHECK1-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40() #[[ATTR5]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40() #[[ATTR4]] // CHECK1-NEXT: br label 
[[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: ret i32 0 // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP33:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP34:%.*]] = extractvalue { ptr, i32 } [[TMP33]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP34]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP34]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK1-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -772,7 +772,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK1-NEXT: entry: 
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -781,14 +781,14 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36 -// CHECK1-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -856,7 +856,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -911,7 +911,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group 
[[ACC_GRP27]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -935,21 +935,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP27]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP27]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40 -// CHECK1-SAME: () #[[ATTR10:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1017,7 +1017,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1072,7 +1072,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP33]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1096,21 
+1096,21 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP33]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP33]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36 -// CHECK1-SAME: () #[[ATTR11:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1178,7 +1178,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) 
#[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1233,7 +1233,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP39]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1257,23 +1257,23 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP39]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP39]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40 -// CHECK1-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 // CHECK1-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_S:%.*]], align 8 // CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK1-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] 
unwind label [[TERMINATE_LPAD:%.*]] // CHECK1: invoke.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] +// CHECK1-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK1-NEXT: store i8 [[TMP0]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 @@ -1282,14 +1282,14 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP2:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP3]]) #[[ATTR13]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP3]]) #[[ATTR7]] // CHECK1-NEXT: unreachable // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1361,7 +1361,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef 
[[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1416,7 +1416,7 @@ int main() { // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP45]] // CHECK1-NEXT: invoke void @_Z3foov() -// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] +// CHECK1-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] // CHECK1: invoke.cont: // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1440,19 +1440,12 @@ int main() { // CHECK1-NEXT: ret void // CHECK1: terminate.lpad: // CHECK1-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK1-NEXT: catch ptr null +// CHECK1-NEXT: catch ptr null // CHECK1-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP45]] +// CHECK1-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP45]] // CHECK1-NEXT: unreachable // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR12:[0-9]+]] section ".text.startup" { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { // CHECK3-NEXT: entry: @@ -1475,7 +1468,7 @@ int main() { // CHECK3-NEXT: store i32 0, 
ptr [[RETVAL]], align 4 // CHECK3-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK3-NEXT: [[CALL:%.*]] = invoke noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK3: invoke.cont: // CHECK3-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 @@ -1494,7 +1487,7 @@ int main() { // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP2]] // CHECK3: invoke.cont1: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -1506,12 +1499,12 @@ int main() { // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK3: lpad: // CHECK3-NEXT: [[TMP5:%.*]] = landingpad { ptr, i32 } -// CHECK3-NEXT: cleanup +// CHECK3-NEXT: cleanup // CHECK3-NEXT: [[TMP6:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 0 // CHECK3-NEXT: store ptr [[TMP6]], ptr [[EXN_SLOT]], align 8 // CHECK3-NEXT: [[TMP7:%.*]] = extractvalue { ptr, i32 } [[TMP5]], 1 // CHECK3-NEXT: store i32 [[TMP7]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR6:[0-9]+]] +// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5:[0-9]+]] // CHECK3-NEXT: br label [[EH_RESUME:%.*]] // CHECK3: omp.inner.for.end: // CHECK3-NEXT: store i32 100, ptr [[I]], align 4 @@ -1533,7 +1526,7 @@ int main() { // CHECK3-NEXT: 
[[ADD12:%.*]] = add nsw i32 0, [[MUL11]] // CHECK3-NEXT: store i32 [[ADD12]], ptr [[I7]], align 4, !llvm.access.group [[ACC_GRP6]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP6]] +// CHECK3-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP6]] // CHECK3: invoke.cont13: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE14:%.*]] // CHECK3: omp.body.continue14: @@ -1548,15 +1541,15 @@ int main() { // CHECK3-NEXT: [[TMP14:%.*]] = load i8, ptr [[A]], align 1 // CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP14]] to i32 // CHECK3-NEXT: [[CALL19:%.*]] = invoke noundef signext i32 @_Z5tmainIcLi5EEiv() -// CHECK3-NEXT: to label [[INVOKE_CONT18:%.*]] unwind label [[LPAD]] +// CHECK3-NEXT: to label [[INVOKE_CONT18:%.*]] unwind label [[LPAD]] // CHECK3: invoke.cont18: // CHECK3-NEXT: [[ADD20:%.*]] = add nsw i32 [[CONV]], [[CALL19]] // CHECK3-NEXT: [[CALL22:%.*]] = invoke noundef signext i32 @_Z5tmainI1SLi1EEiv() -// CHECK3-NEXT: to label [[INVOKE_CONT21:%.*]] unwind label [[LPAD]] +// CHECK3-NEXT: to label [[INVOKE_CONT21:%.*]] unwind label [[LPAD]] // CHECK3: invoke.cont21: // CHECK3-NEXT: [[ADD23:%.*]] = add nsw i32 [[ADD20]], [[CALL22]] // CHECK3-NEXT: store i32 [[ADD23]], ptr [[RETVAL]], align 4 -// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR6]] +// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK3-NEXT: ret i32 [[TMP15]] // CHECK3: eh.resume: @@ -1567,9 +1560,9 @@ int main() { // CHECK3-NEXT: resume { ptr, i32 } [[LPAD_VAL24]] // CHECK3: terminate.lpad: // CHECK3-NEXT: [[TMP16:%.*]] = landingpad { ptr, i32 } -// CHECK3-NEXT: catch ptr null +// CHECK3-NEXT: catch ptr null // CHECK3-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP16]], 0 -// CHECK3-NEXT: 
call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR7:[0-9]+]], !llvm.access.group [[ACC_GRP2]] +// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP17]]) #[[ATTR6:[0-9]+]], !llvm.access.group [[ACC_GRP2]] // CHECK3-NEXT: unreachable // // @@ -1600,8 +1593,8 @@ int main() { // // CHECK3-LABEL: define {{[^@]+}}@__clang_call_terminate // CHECK3-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] comdat { -// CHECK3-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR6]] -// CHECK3-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] +// CHECK3-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5]] +// CHECK3-NEXT: call void @_ZSt9terminatev() #[[ATTR6]] // CHECK3-NEXT: unreachable // // @@ -1634,7 +1627,7 @@ int main() { // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP9]] +// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP9]] // CHECK3: invoke.cont: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -1662,7 +1655,7 @@ int main() { // CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] // CHECK3-NEXT: store i32 [[ADD11]], ptr [[I6]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP12]] +// CHECK3-NEXT: to label [[INVOKE_CONT12:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP12]] // CHECK3: invoke.cont12: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE13:%.*]] // CHECK3: omp.body.continue13: @@ -1677,9 +1670,9 @@ int main() { // CHECK3-NEXT: ret i32 0 // CHECK3: terminate.lpad: // CHECK3-NEXT: [[TMP10:%.*]] = landingpad { ptr, i32 } -// 
CHECK3-NEXT: catch ptr null +// CHECK3-NEXT: catch ptr null // CHECK3-NEXT: [[TMP11:%.*]] = extractvalue { ptr, i32 } [[TMP10]], 0 -// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP9]] +// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR6]], !llvm.access.group [[ACC_GRP9]] // CHECK3-NEXT: unreachable // // @@ -1714,7 +1707,7 @@ int main() { // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP15]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP15]] +// CHECK3-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP15]] // CHECK3: invoke.cont: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -1727,10 +1720,10 @@ int main() { // CHECK3: omp.inner.for.end: // CHECK3-NEXT: store i32 100, ptr [[I]], align 4 // CHECK3-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK3-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]] +// CHECK3-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[TERMINATE_LPAD]] // CHECK3: invoke.cont2: // CHECK3-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR6]] +// CHECK3-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] // CHECK3-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB4]], align 4 // CHECK3-NEXT: store i32 99, ptr [[DOTOMP_UB5]], align 4 @@ -1748,7 +1741,7 @@ int main() { // CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 0, [[MUL11]] // CHECK3-NEXT: store 
i32 [[ADD12]], ptr [[I7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: invoke void @_Z3foov() -// CHECK3-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: to label [[INVOKE_CONT13:%.*]] unwind label [[TERMINATE_LPAD]], !llvm.access.group [[ACC_GRP18]] // CHECK3: invoke.cont13: // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE14:%.*]] // CHECK3: omp.body.continue14: @@ -1763,24 +1756,24 @@ int main() { // CHECK3-NEXT: ret i32 0 // CHECK3: terminate.lpad: // CHECK3-NEXT: [[TMP10:%.*]] = landingpad { ptr, i32 } -// CHECK3-NEXT: catch ptr null +// CHECK3-NEXT: catch ptr null // CHECK3-NEXT: [[TMP11:%.*]] = extractvalue { ptr, i32 } [[TMP10]], 0 -// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP15]] +// CHECK3-NEXT: call void @__clang_call_terminate(ptr [[TMP11]]) #[[ATTR6]], !llvm.access.group [[ACC_GRP15]] // CHECK3-NEXT: unreachable // // // CHECK3-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR5:[0-9]+]] comdat { +// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK3-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR6]] +// CHECK3-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]] // CHECK3-NEXT: ret void // // // CHECK3-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR5]] comdat { +// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef 
[[A:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -1794,7 +1787,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR5]] comdat { +// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -1822,7 +1815,7 @@ int main() { // CHECK5-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK5-NEXT: call void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[S]], i64 noundef 0) // CHECK5-NEXT: [[CALL:%.*]] = invoke noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: store i8 [[CALL]], ptr [[A]], align 1 // CHECK5-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 @@ -1855,16 +1848,16 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50() #[[ATTR5:[0-9]+]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50() #[[ATTR4:[0-9]+]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: lpad: // CHECK5-NEXT: [[TMP15:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: cleanup +// CHECK5-NEXT: cleanup // CHECK5-NEXT: [[TMP16:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 0 // CHECK5-NEXT: store ptr [[TMP16]], ptr 
[[EXN_SLOT]], align 8 // CHECK5-NEXT: [[TMP17:%.*]] = extractvalue { ptr, i32 } [[TMP15]], 1 // CHECK5-NEXT: store i32 [[TMP17]], ptr [[EHSELECTOR_SLOT]], align 4 -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR4]] // CHECK5-NEXT: br label [[EH_RESUME:%.*]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP18:%.*]] = load i8, ptr [[A]], align 1 @@ -1913,21 +1906,21 @@ int main() { // CHECK5-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 // CHECK5-NEXT: br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55(i64 [[TMP19]]) #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55(i64 [[TMP19]]) #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: [[TMP44:%.*]] = load i8, ptr [[A]], align 1 // CHECK5-NEXT: [[CONV:%.*]] = sext i8 [[TMP44]] to i32 // CHECK5-NEXT: [[CALL6:%.*]] = invoke noundef signext i32 @_Z5tmainIcLi5EEiv() -// CHECK5-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT5:%.*]] unwind label [[LPAD]] // CHECK5: invoke.cont5: // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], [[CALL6]] // CHECK5-NEXT: [[CALL8:%.*]] = invoke noundef signext i32 @_Z5tmainI1SLi1EEiv() -// CHECK5-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] +// CHECK5-NEXT: to label [[INVOKE_CONT7:%.*]] unwind label [[LPAD]] // CHECK5: invoke.cont7: // CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD]], [[CALL8]] // CHECK5-NEXT: store i32 [[ADD9]], ptr [[RETVAL]], align 4 -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[S]]) 
#[[ATTR4]] // CHECK5-NEXT: [[TMP45:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK5-NEXT: ret i32 [[TMP45]] // CHECK5: eh.resume: @@ -1971,7 +1964,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2039,7 +2032,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l50.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2094,7 +2087,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP13]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2118,21 
+2111,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13:[0-9]+]], !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7:[0-9]+]], !llvm.access.group [[ACC_GRP13]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@__clang_call_terminate -// CHECK5-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR7:[0-9]+]] comdat { -// CHECK5-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR5]] -// CHECK5-NEXT: call void @_ZSt9terminatev() #[[ATTR13]] +// CHECK5-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK5-NEXT: [[TMP2:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP0]]) #[[ATTR4]] +// CHECK5-NEXT: call void @_ZSt9terminatev() #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55 -// CHECK5-SAME: (i64 noundef [[A:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 @@ -2148,7 +2141,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ 
-2220,7 +2213,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2275,7 +2268,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP21]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP21]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2299,9 +2292,9 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP21]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP21]] // CHECK5-NEXT: unreachable // // @@ -2342,7 +2335,7 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label 
[[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36() #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36() #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -2375,7 +2368,7 @@ int main() { // CHECK5-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK5-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40() #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40() #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: ret i32 0 @@ -2420,14 +2413,14 @@ int main() { // CHECK5-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK5-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36() #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36() #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK5-NEXT: call void 
@_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] // CHECK5-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: [[TMP15:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: [[TMP16:%.*]] = zext i8 [[TMP15]] to i32 @@ -2462,30 +2455,30 @@ int main() { // CHECK5-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 // CHECK5-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK5: omp_offload.failed3: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40() #[[ATTR5]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40() #[[ATTR4]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK5: omp_offload.cont4: // CHECK5-NEXT: ret i32 0 // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP33:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP34:%.*]] = extractvalue { ptr, i32 } [[TMP33]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP34]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP34]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SD1Ev -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8:[0-9]+]] comdat { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK5-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 8 dereferenceable(24) [[THIS1]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 
8 dereferenceable(24) [[THIS1]]) #[[ATTR4]] // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SC2El -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR8]] comdat { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[A:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 @@ -2499,14 +2492,14 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36 -// CHECK5-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2574,7 +2567,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l36.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], 
i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2629,7 +2622,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP27]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP27]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2653,21 +2646,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP27]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP27]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40 -// CHECK5-SAME: () #[[ATTR10:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2735,7 +2728,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIcLi5EEiv_l40.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2790,7 +2783,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP33]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP33]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2814,21 
+2807,21 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP33]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP33]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36 -// CHECK5-SAME: () #[[ATTR11:[0-9]+]] { +// CHECK5-SAME: () #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined) // CHECK5-NEXT: ret void // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2896,7 +2889,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l36.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) 
#[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2951,7 +2944,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP39]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP39]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -2975,23 +2968,23 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP39]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP39]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40 -// CHECK5-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1 // CHECK5-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_S:%.*]], align 8 // CHECK5-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 // CHECK5-NEXT: invoke void @_ZN1SC1El(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]], i64 noundef 23) -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] 
unwind label [[TERMINATE_LPAD:%.*]] // CHECK5: invoke.cont: // CHECK5-NEXT: [[CALL:%.*]] = call noundef signext i8 @_ZN1ScvcEv(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) -// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR5]] +// CHECK5-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR4]] // CHECK5-NEXT: store i8 [[CALL]], ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1 // CHECK5-NEXT: store i8 [[TMP0]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1 @@ -3000,14 +2993,14 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP2:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP3]]) #[[ATTR13]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP3]]) #[[ATTR7]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3079,7 +3072,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainI1SLi1EEiv_l40.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef 
[[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] personality ptr @__gxx_personality_v0 { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR3]] personality ptr @__gxx_personality_v0 { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -3134,7 +3127,7 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP45]] // CHECK5-NEXT: invoke void @_Z3foov() -// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] +// CHECK5-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[TERMINATE_LPAD:%.*]], !llvm.access.group [[ACC_GRP45]] // CHECK5: invoke.cont: // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -3158,24 +3151,17 @@ int main() { // CHECK5-NEXT: ret void // CHECK5: terminate.lpad: // CHECK5-NEXT: [[TMP13:%.*]] = landingpad { ptr, i32 } -// CHECK5-NEXT: catch ptr null +// CHECK5-NEXT: catch ptr null // CHECK5-NEXT: [[TMP14:%.*]] = extractvalue { ptr, i32 } [[TMP13]], 0 -// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR13]], !llvm.access.group [[ACC_GRP45]] +// CHECK5-NEXT: call void @__clang_call_terminate(ptr [[TMP14]]) #[[ATTR7]], !llvm.access.group [[ACC_GRP45]] // CHECK5-NEXT: unreachable // // // CHECK5-LABEL: define {{[^@]+}}@_ZN1SD2Ev -// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR8]] comdat { +// CHECK5-SAME: (ptr noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: [[THIS1:%.*]] = load 
ptr, ptr [[THIS_ADDR]], align 8 // CHECK5-NEXT: ret void // -// -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR12:[0-9]+]] section ".text.startup" { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp index 8ad19077ed73c..7721daf4aa32b 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp @@ -905,13 +905,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1661,13 +1654,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2734,13 +2720,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp 
b/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp index 7d1d16109e4f1..2a3abf176929b 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp @@ -91,7 +91,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l36() #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l36() #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0 @@ -124,7 +124,7 @@ int main() { // CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 // CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]] // CHECK1: omp_offload.failed3: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l39() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l39() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]] // CHECK1: omp_offload.cont4: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -139,7 +139,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l36.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -207,7 +207,7 
@@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l36.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -291,7 +291,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l39.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -359,7 +359,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l39.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -436,7 +436,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () 
#[[ATTR4:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR3:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 @@ -470,7 +470,7 @@ int main() { // CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l29() #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l29() #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret i32 0 @@ -484,7 +484,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l29.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -552,7 +552,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l29.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ 
-628,13 +628,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp index 22671b48c3552..8745dd9710f64 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp @@ -149,7 +149,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70(i64 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70(i64 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -166,7 +166,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 
@@ -239,8 +239,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -258,7 +258,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -345,8 +345,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR2]], ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr 
@.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -364,7 +364,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -384,7 +384,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -404,7 +404,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 @@ -457,7 +457,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret i32 0 @@ -473,7 +473,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -546,8 +546,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -565,7 +565,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -652,8 +652,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR2]], ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -671,7 +671,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -691,7 +691,7 @@ int main() { // // // 
CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -710,13 +710,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -769,7 +762,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70(i32 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70(i32 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -786,7 +779,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // 
CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -857,8 +850,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -876,7 +869,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -961,8 +954,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -980,7 +973,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -1000,7 +993,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l70.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -1020,7 +1013,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 @@ -1073,7 +1066,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label 
[[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: ret i32 0 @@ -1089,7 +1082,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1160,8 +1153,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1179,7 +1172,7 @@ int main() { // // // CHECK3-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1264,8 +1257,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP16]], align 4 // CHECK3-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1283,7 +1276,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: 
[[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -1303,7 +1296,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -1322,13 +1315,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1547,7 +1533,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1620,8 +1606,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP16]], align 8 // CHECK9-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) 
// CHECK9-NEXT: switch i32 [[TMP17]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1639,7 +1625,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1730,8 +1716,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR2]], ptr [[TMP17]], align 8 // CHECK9-NEXT: [[TMP18:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP18]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: 
.omp.reduction.case1: // CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1749,7 +1735,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -1769,7 +1755,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -1788,13 +1774,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp index 8f49b4b4239aa..0c0945a23d482 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp @@ -296,7 +296,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label 
[[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -338,7 +338,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -380,7 +380,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK1-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK1: omp_offload.failed16: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK1: omp_offload.cont17: // CHECK1-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -422,7 +422,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP79:%.*]] = icmp ne i32 [[TMP78]], 0 // CHECK1-NEXT: br i1 [[TMP79]], label 
[[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] // CHECK1: omp_offload.failed24: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT25]] // CHECK1: omp_offload.cont25: // CHECK1-NEXT: [[A26:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -464,7 +464,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP99:%.*]] = icmp ne i32 [[TMP98]], 0 // CHECK1-NEXT: br i1 [[TMP99]], label [[OMP_OFFLOAD_FAILED32:%.*]], label [[OMP_OFFLOAD_CONT33:%.*]] // CHECK1: omp_offload.failed32: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT33]] // CHECK1: omp_offload.cont33: // CHECK1-NEXT: [[A34:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -484,7 +484,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -554,7 +554,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -649,7 +649,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -719,7 +719,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -814,7 +814,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -884,7 +884,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1000,7 +1000,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // 
CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1070,7 +1070,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1163,7 +1163,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1233,7 +1233,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1315,13 +1315,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK2-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK2-SAME: () #[[ATTR0:[0-9]+]] { // CHECK2-NEXT: entry: @@ -1400,7 +1393,7 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK2-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK2: omp_offload.failed: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK2: omp_offload.cont: // CHECK2-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1442,7 +1435,7 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK2-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK2: omp_offload.failed8: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41(ptr [[THIS1]]) #[[ATTR3]] +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41(ptr [[THIS1]]) #[[ATTR2]] // CHECK2-NEXT: br label 
[[OMP_OFFLOAD_CONT9]] // CHECK2: omp_offload.cont9: // CHECK2-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1484,7 +1477,7 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK2-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK2: omp_offload.failed16: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46(ptr [[THIS1]]) #[[ATTR3]] +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46(ptr [[THIS1]]) #[[ATTR2]] // CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK2: omp_offload.cont17: // CHECK2-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1526,7 +1519,7 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[TMP79:%.*]] = icmp ne i32 [[TMP78]], 0 // CHECK2-NEXT: br i1 [[TMP79]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] // CHECK2: omp_offload.failed24: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52(ptr [[THIS1]]) #[[ATTR3]] +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52(ptr [[THIS1]]) #[[ATTR2]] // CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT25]] // CHECK2: omp_offload.cont25: // CHECK2-NEXT: [[A26:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1568,7 +1561,7 @@ int main (int argc, char **argv) { // CHECK2-NEXT: [[TMP99:%.*]] = icmp ne i32 [[TMP98]], 0 // CHECK2-NEXT: br i1 [[TMP99]], label [[OMP_OFFLOAD_FAILED32:%.*]], label [[OMP_OFFLOAD_CONT33:%.*]] // CHECK2: omp_offload.failed32: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58(ptr [[THIS1]]) #[[ATTR3]] +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58(ptr [[THIS1]]) 
#[[ATTR2]] // CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT33]] // CHECK2: omp_offload.cont33: // CHECK2-NEXT: [[A34:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -1588,7 +1581,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1658,7 +1651,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36.omp_outlined.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1753,7 +1746,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr 
noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1823,7 +1816,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41.omp_outlined.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1918,7 +1911,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1988,7 +1981,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46.omp_outlined.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef 
[[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2104,7 +2097,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2174,7 +2167,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52.omp_outlined.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2267,7 +2260,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2337,7 +2330,7 @@ int main (int argc, char **argv) { // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58.omp_outlined.omp_outlined -// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2419,13 +2412,6 @@ int main (int argc, char **argv) { // CHECK2-NEXT: ret void // // -// CHECK2-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK2-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK2-NEXT: entry: -// CHECK2-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK2-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2504,7 +2490,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK5-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK5: omp_offload.failed: 
-// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK5: omp_offload.cont: // CHECK5-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2546,7 +2532,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK5-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK5: omp_offload.failed8: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41(ptr [[THIS1]]) #[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41(ptr [[THIS1]]) #[[ATTR2]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK5: omp_offload.cont9: // CHECK5-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2588,7 +2574,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK5-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK5: omp_offload.failed16: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46(ptr [[THIS1]]) #[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46(ptr [[THIS1]]) #[[ATTR2]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK5: omp_offload.cont17: // CHECK5-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2630,7 +2616,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP79:%.*]] = icmp ne i32 [[TMP78]], 0 // CHECK5-NEXT: br i1 [[TMP79]], label [[OMP_OFFLOAD_FAILED24:%.*]], label 
[[OMP_OFFLOAD_CONT25:%.*]] // CHECK5: omp_offload.failed24: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52(ptr [[THIS1]]) #[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52(ptr [[THIS1]]) #[[ATTR2]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT25]] // CHECK5: omp_offload.cont25: // CHECK5-NEXT: [[A26:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2672,7 +2658,7 @@ int main (int argc, char **argv) { // CHECK5-NEXT: [[TMP99:%.*]] = icmp ne i32 [[TMP98]], 0 // CHECK5-NEXT: br i1 [[TMP99]], label [[OMP_OFFLOAD_FAILED32:%.*]], label [[OMP_OFFLOAD_CONT33:%.*]] // CHECK5: omp_offload.failed32: -// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58(ptr [[THIS1]]) #[[ATTR3]] +// CHECK5-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58(ptr [[THIS1]]) #[[ATTR2]] // CHECK5-NEXT: br label [[OMP_OFFLOAD_CONT33]] // CHECK5: omp_offload.cont33: // CHECK5-NEXT: [[A34:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2692,7 +2678,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2760,7 +2746,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36.omp_outlined.omp_outlined -// 
CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2852,7 +2838,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2920,7 +2906,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ 
-3012,7 +2998,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3080,7 +3066,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3191,7 +3177,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3259,7 +3245,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3349,7 +3335,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3417,7 +3403,7 @@ int main (int argc, char **argv) { // // // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58.omp_outlined.omp_outlined -// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef 
[[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3496,13 +3482,6 @@ int main (int argc, char **argv) { // CHECK5-NEXT: ret void // // -// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK5-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK5-NEXT: entry: -// CHECK5-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK5-NEXT: ret void -// -// // CHECK6-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK6-SAME: () #[[ATTR0:[0-9]+]] { // CHECK6-NEXT: entry: @@ -3581,7 +3560,7 @@ int main (int argc, char **argv) { // CHECK6-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK6-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK6: omp_offload.failed: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK6-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK6: omp_offload.cont: // CHECK6-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3623,7 +3602,7 @@ int main (int argc, char **argv) { // CHECK6-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK6-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK6: omp_offload.failed8: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41(ptr [[THIS1]]) #[[ATTR3]] +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41(ptr [[THIS1]]) #[[ATTR2]] // CHECK6-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK6: omp_offload.cont9: // CHECK6-NEXT: [[A10:%.*]] = getelementptr inbounds 
[[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3665,7 +3644,7 @@ int main (int argc, char **argv) { // CHECK6-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK6-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK6: omp_offload.failed16: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46(ptr [[THIS1]]) #[[ATTR3]] +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46(ptr [[THIS1]]) #[[ATTR2]] // CHECK6-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK6: omp_offload.cont17: // CHECK6-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3707,7 +3686,7 @@ int main (int argc, char **argv) { // CHECK6-NEXT: [[TMP79:%.*]] = icmp ne i32 [[TMP78]], 0 // CHECK6-NEXT: br i1 [[TMP79]], label [[OMP_OFFLOAD_FAILED24:%.*]], label [[OMP_OFFLOAD_CONT25:%.*]] // CHECK6: omp_offload.failed24: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52(ptr [[THIS1]]) #[[ATTR3]] +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52(ptr [[THIS1]]) #[[ATTR2]] // CHECK6-NEXT: br label [[OMP_OFFLOAD_CONT25]] // CHECK6: omp_offload.cont25: // CHECK6-NEXT: [[A26:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3749,7 +3728,7 @@ int main (int argc, char **argv) { // CHECK6-NEXT: [[TMP99:%.*]] = icmp ne i32 [[TMP98]], 0 // CHECK6-NEXT: br i1 [[TMP99]], label [[OMP_OFFLOAD_FAILED32:%.*]], label [[OMP_OFFLOAD_CONT33:%.*]] // CHECK6: omp_offload.failed32: -// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58(ptr [[THIS1]]) #[[ATTR3]] +// CHECK6-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58(ptr [[THIS1]]) #[[ATTR2]] // CHECK6-NEXT: br label [[OMP_OFFLOAD_CONT33]] // CHECK6: omp_offload.cont33: // CHECK6-NEXT: 
[[A34:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -3769,7 +3748,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3837,7 +3816,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l36.omp_outlined.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3929,7 +3908,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // 
CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3997,7 +3976,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l41.omp_outlined.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4089,7 +4068,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4157,7 +4136,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l46.omp_outlined.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4268,7 +4247,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4336,7 +4315,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l52.omp_outlined.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4426,7 +4405,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr 
noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4494,7 +4473,7 @@ int main (int argc, char **argv) { // // // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l58.omp_outlined.omp_outlined -// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK6-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK6-NEXT: entry: // CHECK6-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK6-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -4573,13 +4552,6 @@ int main (int argc, char **argv) { // CHECK6-NEXT: ret void // // -// CHECK6-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK6-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK6-NEXT: entry: -// CHECK6-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK6-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -5101,7 +5073,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK13-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: 
call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -5172,7 +5144,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP72:%.*]] = icmp ne i32 [[TMP71]], 0 // CHECK13-NEXT: br i1 [[TMP72]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK13: omp_offload.failed16: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK13: omp_offload.cont17: // CHECK13-NEXT: [[TMP73:%.*]] = load i32, ptr [[M]], align 4 @@ -5252,7 +5224,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP112:%.*]] = icmp ne i32 [[TMP111]], 0 // CHECK13-NEXT: br i1 [[TMP112]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK13: omp_offload.failed31: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK13: omp_offload.cont32: // CHECK13-NEXT: [[TMP113:%.*]] = load i32, ptr [[N]], align 4 @@ -5323,7 +5295,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP147:%.*]] = icmp ne i32 [[TMP146]], 0 // CHECK13-NEXT: br i1 [[TMP147]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]] // CHECK13: omp_offload.failed46: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169(i64 [[TMP114]], i64 [[TMP1]], ptr [[VLA]]) 
#[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169(i64 [[TMP114]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT47]] // CHECK13: omp_offload.cont47: // CHECK13-NEXT: [[TMP148:%.*]] = load i32, ptr [[M]], align 4 @@ -5403,7 +5375,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP187:%.*]] = icmp ne i32 [[TMP186]], 0 // CHECK13-NEXT: br i1 [[TMP187]], label [[OMP_OFFLOAD_FAILED62:%.*]], label [[OMP_OFFLOAD_CONT63:%.*]] // CHECK13: omp_offload.failed62: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174(i64 [[TMP149]], i64 [[TMP151]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174(i64 [[TMP149]], i64 [[TMP151]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT63]] // CHECK13: omp_offload.cont63: // CHECK13-NEXT: [[TMP188:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -5431,7 +5403,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5534,7 +5506,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5666,7 +5638,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5769,7 +5741,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr 
noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -5910,7 +5882,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6044,7 +6016,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 
noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6178,7 +6150,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6281,7 +6253,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) 
[[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6418,7 +6390,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6527,7 +6499,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) 
[[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6643,7 +6615,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -6715,7 +6687,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK13-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK13: omp_offload.failed: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122(ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122(ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK13: omp_offload.cont: // CHECK13-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -6756,7 +6728,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK13-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK13: omp_offload.failed6: -// CHECK13-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127(ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127(ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK13: omp_offload.cont7: // CHECK13-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -6806,7 +6778,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK13-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK13: omp_offload.failed13: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132(i64 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132(i64 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK13: omp_offload.cont14: // CHECK13-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0 @@ -6847,7 +6819,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CHECK13-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED20:%.*]], label [[OMP_OFFLOAD_CONT21:%.*]] // CHECK13: omp_offload.failed20: -// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137(ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137(ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT21]] // CHECK13: omp_offload.cont21: // CHECK13-NEXT: [[TMP85:%.*]] = load i32, ptr [[M]], align 4 @@ -6897,7 +6869,7 @@ int main (int argc, char **argv) { // CHECK13-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK13-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED28:%.*]], label [[OMP_OFFLOAD_CONT29:%.*]] // CHECK13: omp_offload.failed28: -// CHECK13-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142(i64 [[TMP86]], ptr [[A]]) #[[ATTR4]] +// CHECK13-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142(i64 [[TMP86]], ptr [[A]]) #[[ATTR3]] // CHECK13-NEXT: br label [[OMP_OFFLOAD_CONT29]] // CHECK13: omp_offload.cont29: // CHECK13-NEXT: ret i32 0 @@ -6914,7 +6886,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -6984,7 +6956,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7078,7 +7050,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7148,7 +7120,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7251,7 +7223,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) 
[[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7327,7 +7299,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7445,7 +7417,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7515,7 +7487,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7616,7 +7588,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7692,7 +7664,7 @@ int main (int argc, char **argv) { // // // CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142.omp_outlined.omp_outlined -// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) 
#[[ATTR3]] { +// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -7776,13 +7748,6 @@ int main (int argc, char **argv) { // CHECK13-NEXT: ret void // // -// CHECK13-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK13-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK13-NEXT: entry: -// CHECK13-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK13-NEXT: ret void -// -// // CHECK14-LABEL: define {{[^@]+}}@main // CHECK14-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK14-NEXT: entry: @@ -7919,7 +7884,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK14-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK14: omp_offload.failed: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK14: omp_offload.cont: // CHECK14-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -7990,7 +7955,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP72:%.*]] = icmp ne i32 [[TMP71]], 0 // CHECK14-NEXT: br i1 [[TMP72]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK14: omp_offload.failed16: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// 
CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK14: omp_offload.cont17: // CHECK14-NEXT: [[TMP73:%.*]] = load i32, ptr [[M]], align 4 @@ -8070,7 +8035,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP112:%.*]] = icmp ne i32 [[TMP111]], 0 // CHECK14-NEXT: br i1 [[TMP112]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK14: omp_offload.failed31: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164(i64 [[TMP74]], i64 [[TMP76]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK14: omp_offload.cont32: // CHECK14-NEXT: [[TMP113:%.*]] = load i32, ptr [[N]], align 4 @@ -8141,7 +8106,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP147:%.*]] = icmp ne i32 [[TMP146]], 0 // CHECK14-NEXT: br i1 [[TMP147]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]] // CHECK14: omp_offload.failed46: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169(i64 [[TMP114]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169(i64 [[TMP114]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT47]] // CHECK14: omp_offload.cont47: // CHECK14-NEXT: [[TMP148:%.*]] = load i32, ptr [[M]], align 4 @@ -8221,7 +8186,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP187:%.*]] = icmp ne i32 [[TMP186]], 0 // CHECK14-NEXT: br i1 [[TMP187]], label [[OMP_OFFLOAD_FAILED62:%.*]], label [[OMP_OFFLOAD_CONT63:%.*]] // CHECK14: omp_offload.failed62: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174(i64 [[TMP149]], i64 
[[TMP151]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174(i64 [[TMP149]], i64 [[TMP151]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT63]] // CHECK14: omp_offload.cont63: // CHECK14-NEXT: [[TMP188:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -8249,7 +8214,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -8352,7 +8317,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -8484,7 +8449,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -8587,7 +8552,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -8728,7 +8693,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -8862,7 +8827,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -8996,7 +8961,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -9099,7 +9064,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -9236,7 +9201,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -9345,7 +9310,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -9461,7 +9426,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define 
{{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK14-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK14-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK14-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -9533,7 +9498,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK14-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK14: omp_offload.failed: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122(ptr [[A]]) #[[ATTR4]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122(ptr [[A]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK14: omp_offload.cont: // CHECK14-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -9574,7 +9539,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK14-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK14: omp_offload.failed6: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127(ptr [[A]]) #[[ATTR4]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127(ptr [[A]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK14: omp_offload.cont7: // CHECK14-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -9624,7 +9589,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK14-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK14: omp_offload.failed13: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132(i64 [[TMP41]], ptr [[A]]) 
#[[ATTR4]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132(i64 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK14: omp_offload.cont14: // CHECK14-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0 @@ -9665,7 +9630,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CHECK14-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED20:%.*]], label [[OMP_OFFLOAD_CONT21:%.*]] // CHECK14: omp_offload.failed20: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137(ptr [[A]]) #[[ATTR4]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137(ptr [[A]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT21]] // CHECK14: omp_offload.cont21: // CHECK14-NEXT: [[TMP85:%.*]] = load i32, ptr [[M]], align 4 @@ -9715,7 +9680,7 @@ int main (int argc, char **argv) { // CHECK14-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK14-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED28:%.*]], label [[OMP_OFFLOAD_CONT29:%.*]] // CHECK14: omp_offload.failed28: -// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142(i64 [[TMP86]], ptr [[A]]) #[[ATTR4]] +// CHECK14-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142(i64 [[TMP86]], ptr [[A]]) #[[ATTR3]] // CHECK14-NEXT: br label [[OMP_OFFLOAD_CONT29]] // CHECK14: omp_offload.cont29: // CHECK14-NEXT: ret i32 0 @@ -9732,7 +9697,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -9802,7 +9767,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -9896,7 +9861,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -9966,7 +9931,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10069,7 +10034,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10145,7 +10110,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) 
#[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10263,7 +10228,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10333,7 +10298,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, 
align 8 @@ -10434,7 +10399,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10510,7 +10475,7 @@ int main (int argc, char **argv) { // // // CHECK14-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142.omp_outlined.omp_outlined -// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK14-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK14-NEXT: entry: // CHECK14-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK14-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -10594,13 +10559,6 @@ int main (int argc, char **argv) { // CHECK14-NEXT: ret void // // -// CHECK14-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK14-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK14-NEXT: entry: -// CHECK14-NEXT: call void @__tgt_register_requires(i64 1) -// 
CHECK14-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@main // CHECK17-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -10737,7 +10695,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK17-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK17: omp_offload.cont: // CHECK17-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -10809,7 +10767,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP73:%.*]] = icmp ne i32 [[TMP72]], 0 // CHECK17-NEXT: br i1 [[TMP73]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK17: omp_offload.failed16: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK17: omp_offload.cont17: // CHECK17-NEXT: [[TMP74:%.*]] = load i32, ptr [[M]], align 4 @@ -10890,7 +10848,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP114:%.*]] = icmp ne i32 [[TMP113]], 0 // CHECK17-NEXT: br i1 [[TMP114]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK17: omp_offload.failed31: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK17-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK17: omp_offload.cont32: // CHECK17-NEXT: [[TMP115:%.*]] = load i32, ptr [[N]], align 4 @@ -10962,7 +10920,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP150:%.*]] = icmp ne i32 [[TMP149]], 0 // CHECK17-NEXT: br i1 [[TMP150]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]] // CHECK17: omp_offload.failed46: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169(i32 [[TMP116]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169(i32 [[TMP116]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT47]] // CHECK17: omp_offload.cont47: // CHECK17-NEXT: [[TMP151:%.*]] = load i32, ptr [[M]], align 4 @@ -11043,7 +11001,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP191:%.*]] = icmp ne i32 [[TMP190]], 0 // CHECK17-NEXT: br i1 [[TMP191]], label [[OMP_OFFLOAD_FAILED62:%.*]], label [[OMP_OFFLOAD_CONT63:%.*]] // CHECK17: omp_offload.failed62: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174(i32 [[TMP152]], i32 [[TMP154]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174(i32 [[TMP152]], i32 [[TMP154]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT63]] // CHECK17: omp_offload.cont63: // CHECK17-NEXT: [[TMP192:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -11071,7 +11029,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr 
noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -11172,7 +11130,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -11301,7 +11259,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef 
[[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -11402,7 +11360,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -11540,7 +11498,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef 
nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -11672,7 +11630,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -11803,7 +11761,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) 
[[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -11904,7 +11862,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12038,7 +11996,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12145,7 +12103,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12258,7 +12216,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK17-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK17-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK17-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -12330,7 +12288,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK17-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122(ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122(ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK17: omp_offload.cont: // CHECK17-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -12371,7 +12329,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK17-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK17: omp_offload.failed6: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127(ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127(ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK17: omp_offload.cont7: // CHECK17-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -12421,7 +12379,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK17-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK17: omp_offload.failed13: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132(i32 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132(i32 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK17: omp_offload.cont14: // CHECK17-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0 @@ -12462,7 +12420,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CHECK17-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED20:%.*]], label [[OMP_OFFLOAD_CONT21:%.*]] // CHECK17: omp_offload.failed20: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137(ptr [[A]]) #[[ATTR4]] +// 
CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137(ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT21]] // CHECK17: omp_offload.cont21: // CHECK17-NEXT: [[TMP85:%.*]] = load i32, ptr [[M]], align 4 @@ -12512,7 +12470,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK17-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED28:%.*]], label [[OMP_OFFLOAD_CONT29:%.*]] // CHECK17: omp_offload.failed28: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142(i32 [[TMP86]], ptr [[A]]) #[[ATTR4]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142(i32 [[TMP86]], ptr [[A]]) #[[ATTR3]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT29]] // CHECK17: omp_offload.cont29: // CHECK17-NEXT: ret i32 0 @@ -12529,7 +12487,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12597,7 +12555,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) 
[[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12688,7 +12646,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12756,7 +12714,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12856,7 +12814,7 
@@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -12930,7 +12888,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13043,7 +13001,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 
dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13111,7 +13069,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13209,7 +13167,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 
@@ -13283,7 +13241,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13364,13 +13322,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@main // CHECK19-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -13507,7 +13458,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK19-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: 
omp_offload.cont: // CHECK19-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -13579,7 +13530,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP73:%.*]] = icmp ne i32 [[TMP72]], 0 // CHECK19-NEXT: br i1 [[TMP73]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK19: omp_offload.failed16: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK19: omp_offload.cont17: // CHECK19-NEXT: [[TMP74:%.*]] = load i32, ptr [[M]], align 4 @@ -13660,7 +13611,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP114:%.*]] = icmp ne i32 [[TMP113]], 0 // CHECK19-NEXT: br i1 [[TMP114]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK19: omp_offload.failed31: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164(i32 [[TMP75]], i32 [[TMP77]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK19: omp_offload.cont32: // CHECK19-NEXT: [[TMP115:%.*]] = load i32, ptr [[N]], align 4 @@ -13732,7 +13683,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP150:%.*]] = icmp ne i32 [[TMP149]], 0 // CHECK19-NEXT: br i1 [[TMP150]], label [[OMP_OFFLOAD_FAILED46:%.*]], label [[OMP_OFFLOAD_CONT47:%.*]] // CHECK19: omp_offload.failed46: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169(i32 [[TMP116]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169(i32 [[TMP116]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK19-NEXT: br label 
[[OMP_OFFLOAD_CONT47]] // CHECK19: omp_offload.cont47: // CHECK19-NEXT: [[TMP151:%.*]] = load i32, ptr [[M]], align 4 @@ -13813,7 +13764,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP191:%.*]] = icmp ne i32 [[TMP190]], 0 // CHECK19-NEXT: br i1 [[TMP191]], label [[OMP_OFFLOAD_FAILED62:%.*]], label [[OMP_OFFLOAD_CONT63:%.*]] // CHECK19: omp_offload.failed62: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174(i32 [[TMP152]], i32 [[TMP154]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174(i32 [[TMP152]], i32 [[TMP154]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT63]] // CHECK19: omp_offload.cont63: // CHECK19-NEXT: [[TMP192:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -13841,7 +13792,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -13942,7 +13893,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l154.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 
noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14071,7 +14022,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14172,7 +14123,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l159.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14310,7 +14261,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14442,7 +14393,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l164.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { 
+// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14573,7 +14524,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14674,7 +14625,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l169.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef 
[[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14808,7 +14759,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -14915,7 +14866,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l174.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef 
[[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15028,7 +14979,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK19-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK19-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK19-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -15100,7 +15051,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK19-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122(ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122(ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -15141,7 +15092,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK19-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK19: omp_offload.failed6: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127(ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127(ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK19: 
omp_offload.cont7: // CHECK19-NEXT: [[TMP40:%.*]] = load i32, ptr [[M]], align 4 @@ -15191,7 +15142,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP64:%.*]] = icmp ne i32 [[TMP63]], 0 // CHECK19-NEXT: br i1 [[TMP64]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK19: omp_offload.failed13: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132(i32 [[TMP41]], ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132(i32 [[TMP41]], ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK19: omp_offload.cont14: // CHECK19-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0 @@ -15232,7 +15183,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 // CHECK19-NEXT: br i1 [[TMP84]], label [[OMP_OFFLOAD_FAILED20:%.*]], label [[OMP_OFFLOAD_CONT21:%.*]] // CHECK19: omp_offload.failed20: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137(ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137(ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT21]] // CHECK19: omp_offload.cont21: // CHECK19-NEXT: [[TMP85:%.*]] = load i32, ptr [[M]], align 4 @@ -15282,7 +15233,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK19-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED28:%.*]], label [[OMP_OFFLOAD_CONT29:%.*]] // CHECK19: omp_offload.failed28: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142(i32 [[TMP86]], ptr [[A]]) #[[ATTR4]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142(i32 [[TMP86]], ptr [[A]]) #[[ATTR3]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT29]] // 
CHECK19: omp_offload.cont29: // CHECK19-NEXT: ret i32 0 @@ -15299,7 +15250,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15367,7 +15318,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l122.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15458,7 +15409,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15526,7 +15477,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l127.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15626,7 +15577,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15700,7 +15651,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: 
define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l132.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15813,7 +15764,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15881,7 +15832,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l137.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: 
(ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -15979,7 +15930,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -16053,7 +16004,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l142.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] 
= alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -16134,13 +16085,6 @@ int main (int argc, char **argv) { // CHECK19-NEXT: ret void // // -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// -// // CHECK21-LABEL: define {{[^@]+}}@main // CHECK21-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK21-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_private_codegen.cpp index 1ba0a36134d8c..bdc001fe5d5d8 100644 --- a/clang/test/OpenMP/teams_distribute_private_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_private_codegen.cpp @@ -287,7 +287,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l93.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -391,7 +391,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -491,7 +491,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -650,13 +650,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -822,7 +815,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l93.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -924,7 +917,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1024,7 +1017,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1181,13 +1174,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1312,7 +1298,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75 -// CHECK9-SAME: (i64 noundef [[G1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[G1_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[TMP:%.*]] = alloca ptr, align 8 @@ -1323,7 +1309,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1410,10 +1396,3 @@ int main() { // CHECK9-NEXT: call void @__cxx_global_var_init.2() // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp index 3fbd791332a95..309cbffcf6162 100644 --- a/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp @@ -134,7 
+134,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63(i64 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63(i64 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -151,7 +151,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -221,8 +221,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: 
[[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -241,7 +241,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -261,7 +261,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 @@ -314,7 +314,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret i32 0 @@ -330,7 +330,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -400,8 +400,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -420,7 +420,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -439,13 +439,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -498,7 +491,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63(i32 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63(i32 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -515,7 +508,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -585,8 +578,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -605,7 +598,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef 
[[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -625,7 +618,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 @@ -678,7 +671,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: ret i32 0 @@ -694,7 +687,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -764,8 +757,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr 
[[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -784,7 +777,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -803,13 +796,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -830,7 +816,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: 
entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -904,8 +890,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP13]], align 8 // CHECK9-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP14]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -924,7 +910,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -942,10 +928,3 @@ int main() { // CHECK9-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp index 751bb6ca35ba8..ee5d5cad72fe2 100644 --- a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp +++ 
b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp @@ -645,13 +645,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_argument_globali // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1085,13 +1078,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_argument_globali // CHECK5-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1582,13 +1568,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1810,13 +1789,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -2147,13 +2119,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// 
CHECK17-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK19-SAME: () #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -2331,13 +2296,6 @@ int main (int argc, char **argv) { // CHECK19-NEXT: ret void // // -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// -// // CHECK21-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK21-SAME: () #[[ATTR0:[0-9]+]] { // CHECK21-NEXT: entry: @@ -2505,7 +2463,7 @@ int main (int argc, char **argv) { // CHECK21-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK21-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP4]] // CHECK21-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 1 -// CHECK21-NEXT: [[TMP10:%.*]] = load float, ptr [[B]], align 4, !nontemporal !5, !llvm.access.group [[ACC_GRP4]] +// CHECK21-NEXT: [[TMP10:%.*]] = load float, ptr [[B]], align 4, !nontemporal [[META5:![0-9]+]], !llvm.access.group [[ACC_GRP4]] // CHECK21-NEXT: [[CONV:%.*]] = fptosi float [[TMP10]] to i32 // CHECK21-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[TMP0]], i32 0, i32 0 // CHECK21-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP4]] @@ -2566,13 +2524,6 @@ int main (int argc, char **argv) { // CHECK21-NEXT: ret void // // -// CHECK21-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK21-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK21-NEXT: entry: -// CHECK21-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK21-NEXT: ret void -// -// // CHECK23-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK23-SAME: () #[[ATTR0:[0-9]+]] { // CHECK23-NEXT: entry: @@ -2740,7 +2691,7 @@ int main (int 
argc, char **argv) { // CHECK23-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK23-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK23-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 1 -// CHECK23-NEXT: [[TMP10:%.*]] = load float, ptr [[B]], align 4, !nontemporal !6, !llvm.access.group [[ACC_GRP5]] +// CHECK23-NEXT: [[TMP10:%.*]] = load float, ptr [[B]], align 4, !nontemporal [[META6:![0-9]+]], !llvm.access.group [[ACC_GRP5]] // CHECK23-NEXT: [[CONV:%.*]] = fptosi float [[TMP10]] to i32 // CHECK23-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[TMP0]], i32 0, i32 0 // CHECK23-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP5]] @@ -2799,13 +2750,6 @@ int main (int argc, char **argv) { // CHECK23-NEXT: ret void // // -// CHECK23-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK23-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK23-NEXT: entry: -// CHECK23-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK23-NEXT: ret void -// -// // CHECK25-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK25-SAME: () #[[ATTR0:[0-9]+]] { // CHECK25-NEXT: entry: @@ -2966,7 +2910,7 @@ int main (int argc, char **argv) { // CHECK29-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK29-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] // CHECK29-NEXT: [[B3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 -// CHECK29-NEXT: [[TMP6:%.*]] = load float, ptr [[B3]], align 4, !nontemporal !3, !llvm.access.group [[ACC_GRP2]] +// CHECK29-NEXT: [[TMP6:%.*]] = load float, ptr [[B3]], align 4, !nontemporal [[META3:![0-9]+]], !llvm.access.group [[ACC_GRP2]] // CHECK29-NEXT: [[CONV:%.*]] = fptosi float [[TMP6]] to i32 // CHECK29-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK29-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP2]] @@ -3066,7 
+3010,7 @@ int main (int argc, char **argv) { // CHECK31-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK31-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]] // CHECK31-NEXT: [[B3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 1 -// CHECK31-NEXT: [[TMP6:%.*]] = load float, ptr [[B3]], align 4, !nontemporal !4, !llvm.access.group [[ACC_GRP3]] +// CHECK31-NEXT: [[TMP6:%.*]] = load float, ptr [[B3]], align 4, !nontemporal [[META4:![0-9]+]], !llvm.access.group [[ACC_GRP3]] // CHECK31-NEXT: [[CONV:%.*]] = fptosi float [[TMP6]] to i32 // CHECK31-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK31-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]] @@ -3526,13 +3470,6 @@ int main (int argc, char **argv) { // CHECK33-NEXT: ret void // // -// CHECK33-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK33-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK33-NEXT: entry: -// CHECK33-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK33-NEXT: ret void -// -// // CHECK35-LABEL: define {{[^@]+}}@main // CHECK35-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK35-NEXT: entry: @@ -3938,13 +3875,6 @@ int main (int argc, char **argv) { // CHECK35-NEXT: ret void // // -// CHECK35-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK35-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK35-NEXT: entry: -// CHECK35-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK35-NEXT: ret void -// -// // CHECK37-LABEL: define {{[^@]+}}@main // CHECK37-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK37-NEXT: entry: @@ -4409,13 +4339,6 @@ int main (int argc, char **argv) { // CHECK37-NEXT: ret void // // -// CHECK37-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK37-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK37-NEXT: entry: -// CHECK37-NEXT: call void 
@__tgt_register_requires(i64 1) -// CHECK37-NEXT: ret void -// -// // CHECK39-LABEL: define {{[^@]+}}@main // CHECK39-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK39-NEXT: entry: @@ -4877,13 +4800,6 @@ int main (int argc, char **argv) { // CHECK39-NEXT: ret void // // -// CHECK39-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK39-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK39-NEXT: entry: -// CHECK39-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK39-NEXT: ret void -// -// // CHECK41-LABEL: define {{[^@]+}}@main // CHECK41-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK41-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp index 9562d2d973446..292bfb2a296c6 100644 --- a/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp @@ -157,7 +157,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -178,7 +178,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef 
[[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -267,13 +267,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -333,7 +326,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -354,7 +347,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, 
align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -441,13 +434,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -717,7 +703,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CHECK9-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP51:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -750,7 +736,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], 
ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -916,7 +902,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -965,7 +951,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: ret i32 0 @@ -982,7 +968,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1070,13 +1056,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// 
CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1205,7 +1184,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 // CHECK11-NEXT: br i1 [[TMP49]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1238,7 +1217,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1402,7 +1381,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -1451,7 +1430,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: ret i32 0 @@ -1468,7 +1447,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1554,13 +1533,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () 
#[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp index 0076e442d2c97..a18b4d2feeec6 100644 --- a/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp @@ -199,7 +199,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -241,7 +241,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK1-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK1: omp_offload.failed8: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK1: omp_offload.cont9: // CHECK1-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -283,7 
+283,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK1-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK1: omp_offload.failed16: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK1: omp_offload.cont17: // CHECK1-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -303,7 +303,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -388,7 +388,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -473,7 +473,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: 
define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -564,13 +564,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -639,7 +632,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -681,7 +674,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK3-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED8:%.*]], label [[OMP_OFFLOAD_CONT9:%.*]] // CHECK3: omp_offload.failed8: -// CHECK3-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT9]] // CHECK3: omp_offload.cont9: // CHECK3-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -723,7 +716,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK3-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK3: omp_offload.failed16: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38(ptr [[THIS1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK3: omp_offload.cont17: // CHECK3-NEXT: [[A18:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -743,7 +736,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -827,7 +820,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l33.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr 
noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -911,7 +904,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l38.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1001,13 +994,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1369,7 +1355,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK9-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: 
[[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -1440,7 +1426,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP72:%.*]] = icmp ne i32 [[TMP71]], 0 // CHECK9-NEXT: br i1 [[TMP72]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK9: omp_offload.failed16: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105(i64 [[TMP39]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK9: omp_offload.cont17: // CHECK9-NEXT: [[TMP73:%.*]] = load i32, ptr [[N]], align 4 @@ -1511,7 +1497,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP107:%.*]] = icmp ne i32 [[TMP106]], 0 // CHECK9-NEXT: br i1 [[TMP107]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK9: omp_offload.failed31: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i64 [[TMP74]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i64 [[TMP74]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK9: omp_offload.cont32: // CHECK9-NEXT: [[TMP108:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1539,7 +1525,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull 
align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1661,7 +1647,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1790,7 +1776,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1917,7 +1903,7 @@ int main (int argc, char 
**argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -1975,7 +1961,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -2016,7 +2002,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK9: omp_offload.failed6: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK9: omp_offload.cont7: // CHECK9-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 @@ -2057,7 +2043,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK9-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK9: omp_offload.failed13: -// CHECK9-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK9: omp_offload.cont14: // CHECK9-NEXT: ret i32 0 @@ -2074,7 +2060,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2158,7 +2144,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2242,7 +2228,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 
dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2332,13 +2318,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -2453,7 +2432,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK11-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP38:%.*]] = load i32, ptr [[N]], align 4 @@ -2525,7 +2504,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP73:%.*]] = icmp ne i32 [[TMP72]], 0 // CHECK11-NEXT: br i1 [[TMP73]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]] // CHECK11: omp_offload.failed16: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105(i32 [[TMP39]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105(i32 [[TMP39]], i32 [[TMP0]], ptr 
[[VLA]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT17]] // CHECK11: omp_offload.cont17: // CHECK11-NEXT: [[TMP74:%.*]] = load i32, ptr [[N]], align 4 @@ -2597,7 +2576,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP109:%.*]] = icmp ne i32 [[TMP108]], 0 // CHECK11-NEXT: br i1 [[TMP109]], label [[OMP_OFFLOAD_FAILED31:%.*]], label [[OMP_OFFLOAD_CONT32:%.*]] // CHECK11: omp_offload.failed31: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i32 [[TMP75]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110(i32 [[TMP75]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT32]] // CHECK11: omp_offload.cont32: // CHECK11-NEXT: [[TMP110:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -2625,7 +2604,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l100.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2746,7 +2725,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l105.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) 
[[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2874,7 +2853,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l110.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3000,7 +2979,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -3058,7 +3037,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label 
[[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0 @@ -3099,7 +3078,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK11-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK11: omp_offload.failed6: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK11: omp_offload.cont7: // CHECK11-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0 @@ -3140,7 +3119,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP59:%.*]] = icmp ne i32 [[TMP58]], 0 // CHECK11-NEXT: br i1 [[TMP59]], label [[OMP_OFFLOAD_FAILED13:%.*]], label [[OMP_OFFLOAD_CONT14:%.*]] // CHECK11: omp_offload.failed13: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT14]] // CHECK11: omp_offload.cont14: // CHECK11-NEXT: ret i32 0 @@ -3157,7 +3136,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l79.omp_outlined -// CHECK11-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3240,7 +3219,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l84.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3323,7 +3302,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l89.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3412,13 +3391,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp index b8fd9c8fcfd5b..dd3daf865512a 100644 --- a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp @@ -371,7 +371,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l94.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -538,7 +538,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca 
[[STRUCT_S_0:%.*]], align 4 @@ -737,7 +737,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -960,13 +960,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1196,7 +1189,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l94.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef 
nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1361,7 +1354,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1560,7 +1553,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 noundef [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1781,13 +1774,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define 
{{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1915,7 +1901,7 @@ int main() { // // // CHECK5-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK5-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK5-SAME: () #[[ATTR1]] comdat { // CHECK5-NEXT: entry: // CHECK5-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK5-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2237,7 +2223,7 @@ int main() { // // // CHECK7-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK7-SAME: () #[[ATTR5:[0-9]+]] comdat { +// CHECK7-SAME: () #[[ATTR1]] comdat { // CHECK7-NEXT: entry: // CHECK7-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK7-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2556,7 +2542,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75 -// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[SIVAR:%.*]], i64 noundef [[G1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G:%.*]], i64 noundef [[SIVAR:%.*]], i64 noundef [[G1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[G_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8 @@ -2584,7 +2570,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[G:%.*]], i64 noundef [[G1:%.*]], i64 noundef [[SIVAR:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2680,13 +2666,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp index 5a44016a203f5..073960cc5a460 100644 --- a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp @@ -174,7 +174,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -292,13 +292,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () 
#[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -347,7 +340,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l67.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[G1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SFVAR:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -465,13 +458,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -611,7 +597,7 @@ int main() { // CHECK9-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK9-NEXT: br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i64 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i64 [[TMP5]]) #[[ATTR4:[0-9]+]] // 
CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -622,11 +608,11 @@ int main() { // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP40]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done2: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP41]] // @@ -678,7 +664,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 
dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -817,14 +803,14 @@ int main() { // CHECK9-NEXT: store i32 [[TMP28]], ptr [[TMP4]], align 4 // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN14:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN14]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP29]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN14]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE15:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done15: @@ -837,12 +823,12 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // 
CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -929,7 +915,7 @@ int main() { // CHECK9-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 // CHECK9-NEXT: br i1 [[TMP34]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i64 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -939,11 +925,11 @@ int main() { // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP35]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], 
[[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done2: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP36:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP36]] // @@ -1027,7 +1013,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1160,14 +1146,14 @@ int main() { // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP4]], ptr align 4 [[TMP26]], i64 4, i1 false) // CHECK9-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK9: .omp.lastprivate.done: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN13:%.*]] = 
getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN13]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP27]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done14: @@ -1180,7 +1166,7 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // @@ -1218,13 +1204,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1328,7 +1307,7 @@ int main() { // CHECK11-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 // CHECK11-NEXT: 
br i1 [[TMP39]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i32 [[TMP5]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]], i32 [[TMP5]]) #[[ATTR4:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -1339,11 +1318,11 @@ int main() { // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP40]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done2: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP41:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP41]] // @@ -1395,7 +1374,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr 
noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SVAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1532,14 +1511,14 @@ int main() { // CHECK11-NEXT: store i32 [[TMP28]], ptr [[TMP4]], align 4 // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN13]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP29]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) 
#[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done14: @@ -1552,12 +1531,12 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1644,7 +1623,7 @@ int main() { // CHECK11-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 // CHECK11-NEXT: br i1 [[TMP34]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49(i32 [[TMP2]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP3]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -1654,11 +1633,11 @@ int main() { // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP35]], [[OMP_OFFLOAD_CONT]] ], [ 
[[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done2: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP36:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP36]] // @@ -1742,7 +1721,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l49.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[VEC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]], ptr noundef nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: 
[[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1873,14 +1852,14 @@ int main() { // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP4]], ptr align 4 [[TMP26]], i32 4, i1 false) // CHECK11-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // CHECK11: .omp.lastprivate.done: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i32 0, i32 0 // CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN12]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP27]], [[DOTOMP_LASTPRIVATE_DONE]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done13: @@ -1893,7 +1872,7 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 
dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // @@ -1931,13 +1910,6 @@ int main() { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK13-LABEL: define {{[^@]+}}@main // CHECK13-SAME: () #[[ATTR0:[0-9]+]] { // CHECK13-NEXT: entry: @@ -2048,14 +2020,14 @@ int main() { // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP4]], ptr align 4 [[TMP15]], i64 4, i1 false) // CHECK13-NEXT: [[TMP16:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK13-NEXT: store i32 [[TMP16]], ptr @_ZZ4mainE4svar, align 4 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4:[0-9]+]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR3:[0-9]+]] // CHECK13-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR5]], i32 0, i32 0 // CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN13]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label 
[[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done14: @@ -2067,11 +2039,11 @@ int main() { // CHECK13: arraydestroy.body17: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST18:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE14]] ], [ [[ARRAYDESTROY_ELEMENT19:%.*]], [[ARRAYDESTROY_BODY17]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT19]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST18]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT19]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE20:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT19]], [[ARRAY_BEGIN16]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE20]], label [[ARRAYDESTROY_DONE21:%.*]], label [[ARRAYDESTROY_BODY17]] // CHECK13: arraydestroy.done21: -// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP19]] // @@ -2105,12 +2077,12 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // // CHECK13-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK13-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK13-SAME: () #[[ATTR1]] comdat { // CHECK13-NEXT: entry: // CHECK13-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK13-NEXT: [[TEST:%.*]] = alloca 
[[STRUCT_S_0:%.*]], align 4 @@ -2211,14 +2183,14 @@ int main() { // CHECK13: omp.arraycpy.done12: // CHECK13-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8 // CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP4]], ptr align 4 [[TMP15]], i64 4, i1 false) -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAY_BEGIN13:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR5]], i32 0, i32 0 // CHECK13-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN13]], i64 2 // CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK13: arraydestroy.body: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP16]], [[OMP_ARRAYCPY_DONE12]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN13]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE14:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK13: arraydestroy.done14: @@ -2229,11 +2201,11 @@ int main() { // CHECK13: arraydestroy.body16: // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST17:%.*]] = phi ptr [ [[TMP17]], [[ARRAYDESTROY_DONE14]] ], [ [[ARRAYDESTROY_ELEMENT18:%.*]], [[ARRAYDESTROY_BODY16]] ] // CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT18]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST17]], i64 -1 -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 
dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR3]] // CHECK13-NEXT: [[ARRAYDESTROY_DONE19:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT18]], [[ARRAY_BEGIN15]] // CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_DONE20:%.*]], label [[ARRAYDESTROY_BODY16]] // CHECK13: arraydestroy.done20: -// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK13-NEXT: [[TMP18:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK13-NEXT: ret i32 [[TMP18]] // @@ -2301,7 +2273,7 @@ int main() { // CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK13-NEXT: ret void // // @@ -2447,14 +2419,14 @@ int main() { // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP4]], ptr align 4 [[TMP15]], i32 4, i1 false) // CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[SVAR]], align 4 // CHECK15-NEXT: store i32 [[TMP16]], ptr @_ZZ4mainE4svar, align 4 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4:[0-9]+]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR3:[0-9]+]] // CHECK15-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR5]], i32 0, i32 0 // CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN12]], i32 2 // CHECK15-NEXT: br label 
[[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_ARRAYCPY_DONE11]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done13: @@ -2466,11 +2438,11 @@ int main() { // CHECK15: arraydestroy.body16: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST17:%.*]] = phi ptr [ [[TMP18]], [[ARRAYDESTROY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT18:%.*]], [[ARRAYDESTROY_BODY16]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT18]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST17]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT18]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE19:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT18]], [[ARRAY_BEGIN15]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE19]], label [[ARRAYDESTROY_DONE20:%.*]], label [[ARRAYDESTROY_BODY16]] // CHECK15: arraydestroy.done20: -// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP19:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: 
ret i32 [[TMP19]] // @@ -2504,12 +2476,12 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: ret void // // // CHECK15-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK15-SAME: () #[[ATTR3:[0-9]+]] comdat { +// CHECK15-SAME: () #[[ATTR1]] comdat { // CHECK15-NEXT: entry: // CHECK15-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK15-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -2608,14 +2580,14 @@ int main() { // CHECK15: omp.arraycpy.done11: // CHECK15-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 4 // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP4]], ptr align 4 [[TMP15]], i32 4, i1 false) -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAY_BEGIN12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR5]], i32 0, i32 0 // CHECK15-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN12]], i32 2 // CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK15: arraydestroy.body: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP16]], [[OMP_ARRAYCPY_DONE11]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] +// CHECK15-NEXT: call void 
@_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN12]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE13:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK15: arraydestroy.done13: @@ -2626,11 +2598,11 @@ int main() { // CHECK15: arraydestroy.body15: // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST16:%.*]] = phi ptr [ [[TMP17]], [[ARRAYDESTROY_DONE13]] ], [ [[ARRAYDESTROY_ELEMENT17:%.*]], [[ARRAYDESTROY_BODY15]] ] // CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT17]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST16]], i32 -1 -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT17]]) #[[ATTR3]] // CHECK15-NEXT: [[ARRAYDESTROY_DONE18:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT17]], [[ARRAY_BEGIN14]] // CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE18]], label [[ARRAYDESTROY_DONE19:%.*]], label [[ARRAYDESTROY_BODY15]] // CHECK15: arraydestroy.done19: -// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR3]] // CHECK15-NEXT: [[TMP18:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK15-NEXT: ret i32 [[TMP18]] // @@ -2698,7 +2670,7 @@ int main() { // CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] +// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR3]] // CHECK15-NEXT: 
ret void // // diff --git a/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp index 1e629933d7398..52b213c345915 100644 --- a/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp @@ -665,13 +665,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1210,13 +1203,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -2199,13 +2185,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp index 1bf35ad2772cb..e660009f1cd7f 100644 --- a/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp @@ -134,7 +134,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], 
label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63(i64 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63(i64 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -151,7 +151,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -228,8 +228,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -248,7 +248,7 @@ int main() { // // // CHECK1-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -268,7 +268,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 @@ -321,7 +321,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret i32 0 @@ -337,7 +337,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -414,8 +414,8 @@ int main() { // CHECK1-NEXT: store ptr 
[[T_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -434,7 +434,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -453,13 +453,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -512,7 +505,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63(i32 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63(i32 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -529,7 +522,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -606,8 +599,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -626,7 +619,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l63.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr 
noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -646,7 +639,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4 @@ -699,7 +692,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: ret i32 0 @@ -715,7 +708,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -792,8 +785,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -812,7 +805,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -831,13 +824,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK5-LABEL: define {{[^@]+}}@main // CHECK5-SAME: () #[[ATTR0:[0-9]+]] { // CHECK5-NEXT: entry: @@ -1056,7 +1042,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1137,8 +1123,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP15]], align 8 // CHECK9-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1157,7 +1143,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -1176,13 +1162,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: diff --git a/clang/test/OpenMP/teams_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_firstprivate_codegen.cpp index dbdd5f912bb6a..c023de8cf010d 100644 --- 
a/clang/test/OpenMP/teams_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_firstprivate_codegen.cpp @@ -187,7 +187,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l91.omp_outlined -// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[G:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -208,13 +208,6 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -245,7 +238,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l91.omp_outlined -// CHECK3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[G:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -266,13 +259,6 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () 
#[[ATTR5:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -370,7 +356,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 // CHECK9-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l109(i64 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]], i64 [[TMP3]]) #[[ATTR5:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l109(i64 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]], i64 [[TMP3]]) #[[ATTR4:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP36:%.*]] = load i32, ptr [[T_VAR]], align 4 @@ -414,23 +400,23 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], 0 // CHECK9-NEXT: br i1 [[TMP57]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK9: omp_offload.failed6: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116(i64 [[TMP37]]) #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116(i64 [[TMP37]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK9: omp_offload.cont7: // CHECK9-NEXT: [[CALL:%.*]] = call signext i32 @_Z5tmainIiET_v() // CHECK9-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0 // CHECK9-NEXT: 
[[TMP58:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP58]], [[OMP_OFFLOAD_CONT7]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done8: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP59:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP59]] // @@ -487,7 +473,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l109.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 [[SIVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]], i64 [[SIVAR:%.*]]) 
#[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -521,7 +507,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) // CHECK9-NEXT: call void @_ZN1SIfEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], ptr [[AGG_TMP]]) -// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR4]] // CHECK9-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 // CHECK9-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S]], ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 // CHECK9-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] @@ -529,21 +515,21 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9: omp.arraycpy.done3: // CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) // CHECK9-NEXT: call void @_ZN1SIfEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR4]], ptr nonnull align 4 dereferenceable(4) [[TMP2]], ptr [[AGG_TMP5]]) -// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC1]], i64 0, i64 0 // 
CHECK9-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 4 // CHECK9-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i64 0, i64 0 // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX6]], ptr align 4 [[VAR4]], i64 4, i1 false) // CHECK9-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN7]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[OMP_ARRAYCPY_DONE3]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done8: @@ -581,7 +567,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN2StD2Ev(ptr nonnull align 4 dereferenceable(8) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN2StD2Ev(ptr nonnull align 
4 dereferenceable(8) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // @@ -591,7 +577,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // @@ -609,7 +595,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[T_VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -621,7 +607,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -706,7 +692,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 // CHECK9-NEXT: br i1 [[TMP30]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75(i64 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]]) #[[ATTR5]] +// CHECK9-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75(i64 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP31:%.*]] = load i32, ptr [[T_VAR]], align 128 @@ -750,22 +736,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 // CHECK9-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK9: omp_offload.failed6: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81(i64 [[TMP32]]) #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81(i64 [[TMP32]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK9: omp_offload.cont7: // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 // CHECK9-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP53]], [[OMP_OFFLOAD_CONT7]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] 
// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done8: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP54:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP54]] // @@ -899,7 +885,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i64 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -931,7 +917,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) // CHECK9-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], ptr [[AGG_TMP]]) -// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR5]] 
+// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR4]] // CHECK9-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 // CHECK9-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 // CHECK9-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] @@ -939,20 +925,20 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9: omp.arraycpy.done3: // CHECK9-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) // CHECK9-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR4]], ptr nonnull align 4 dereferenceable(4) [[TMP2]], ptr [[AGG_TMP5]]) -// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC1]], i64 0, i64 0 // CHECK9-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 128 // CHECK9-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i64 0, i64 0 // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[ARRAYIDX6]], ptr align 128 [[VAR4]], i64 4, i1 false) -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN7]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: 
[[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[OMP_ARRAYCPY_DONE3]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done8: @@ -980,7 +966,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // @@ -998,7 +984,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined -// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[T_VAR:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1067,13 +1053,6 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define 
{{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1171,7 +1150,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 // CHECK11-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l109(i32 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]], i32 [[TMP3]]) #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l109(i32 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]], i32 [[TMP3]]) #[[ATTR4:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP36:%.*]] = load i32, ptr [[T_VAR]], align 4 @@ -1215,23 +1194,23 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], 0 // CHECK11-NEXT: br i1 [[TMP57]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK11: omp_offload.failed6: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116(i32 [[TMP37]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116(i32 [[TMP37]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK11: omp_offload.cont7: // CHECK11-NEXT: [[CALL:%.*]] = call i32 @_Z5tmainIiET_v() // CHECK11-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = 
getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP58]], [[OMP_OFFLOAD_CONT7]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done8: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP59:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP59]] // @@ -1288,7 +1267,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l109.omp_outlined -// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 [[T_VAR:%.*]], ptr nonnull align 4 
dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]], i32 [[SIVAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1322,7 +1301,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK11-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) // CHECK11-NEXT: call void @_ZN1SIfEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], ptr [[AGG_TMP]]) -// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR4]] // CHECK11-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 // CHECK11-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S]], ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 // CHECK11-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] @@ -1330,21 +1309,21 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11: omp.arraycpy.done3: // CHECK11-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) // CHECK11-NEXT: call void @_ZN1SIfEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR4]], ptr nonnull align 4 dereferenceable(4) [[TMP2]], ptr [[AGG_TMP5]]) -// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr 
[[T_VAR_ADDR]], align 4 // CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC1]], i32 0, i32 0 // CHECK11-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 4 // CHECK11-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX6]], ptr align 4 [[VAR4]], i32 4, i1 false) // CHECK11-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN7]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[OMP_ARRAYCPY_DONE3]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done8: @@ -1382,7 +1361,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// 
CHECK11-NEXT: call void @_ZN2StD2Ev(ptr nonnull align 4 dereferenceable(8) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN2StD2Ev(ptr nonnull align 4 dereferenceable(8) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // @@ -1392,7 +1371,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // @@ -1410,7 +1389,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l116.omp_outlined -// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[T_VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1422,7 +1401,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1507,7 +1486,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 // CHECK11-NEXT: br i1 [[TMP30]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: 
-// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75(i32 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75(i32 [[TMP1]], ptr [[VEC]], ptr [[S_ARR]], ptr [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP31:%.*]] = load i32, ptr [[T_VAR]], align 128 @@ -1551,22 +1530,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 // CHECK11-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] // CHECK11: omp_offload.failed6: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81(i32 [[TMP32]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81(i32 [[TMP32]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT7]] // CHECK11: omp_offload.cont7: // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP53]], [[OMP_OFFLOAD_CONT7]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// 
CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done8: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP54:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP54]] // @@ -1700,7 +1679,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l75.omp_outlined -// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(8) [[VEC:%.*]], i32 [[T_VAR:%.*]], ptr nonnull align 4 dereferenceable(8) [[S_ARR:%.*]], ptr nonnull align 4 dereferenceable(4) [[VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1732,7 +1711,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] // CHECK11-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) // CHECK11-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 
dereferenceable(4) [[OMP_ARRAYCPY_DESTELEMENTPAST]], ptr nonnull align 4 dereferenceable(4) [[OMP_ARRAYCPY_SRCELEMENTPAST]], ptr [[AGG_TMP]]) -// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP]]) #[[ATTR4]] // CHECK11-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1 // CHECK11-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr [[STRUCT_S_0]], ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 // CHECK11-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] @@ -1740,20 +1719,20 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11: omp.arraycpy.done3: // CHECK11-NEXT: call void @_ZN2StC1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) // CHECK11-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr nonnull align 4 dereferenceable(4) [[VAR4]], ptr nonnull align 4 dereferenceable(4) [[TMP2]], ptr [[AGG_TMP5]]) -// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN2StD1Ev(ptr nonnull align 4 dereferenceable(8) [[AGG_TMP5]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC1]], i32 0, i32 0 // CHECK11-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 128 // CHECK11-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[ARRAYIDX6]], ptr align 128 [[VAR4]], i32 4, i1 false) -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[VAR4]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr 
inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN7]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[OMP_ARRAYCPY_DONE3]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done8: @@ -1781,7 +1760,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // @@ -1799,7 +1778,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined -// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[T_VAR:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[T_VAR:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1868,13 +1847,6 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@_Z10array_funcPfP2StiPg // CHECK17-SAME: (ptr [[A:%.*]], ptr [[S:%.*]], i32 signext [[N:%.*]], ptr [[VLA1:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -1999,7 +1971,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], 0 // CHECK17-NEXT: br i1 [[TMP57]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z10array_funcPfP2StiPg_l152(ptr [[TMP8]], ptr [[TMP9]], i64 [[TMP1]], ptr [[TMP10]], i64 [[TMP3]], i64 [[TMP5]], ptr [[VLA]], i64 [[TMP12]]) #[[ATTR5:[0-9]+]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z10array_funcPfP2StiPg_l152(ptr [[TMP8]], ptr [[TMP9]], i64 [[TMP1]], ptr [[TMP10]], i64 [[TMP3]], i64 [[TMP5]], ptr [[VLA]], i64 [[TMP12]]) #[[ATTR4:[0-9]+]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK17: omp_offload.cont: // CHECK17-NEXT: [[TMP58:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8 @@ -2038,7 +2010,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z10array_funcPfP2StiPg_l152.omp_outlined -// CHECK17-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr [[S:%.*]], ptr nonnull align 4 dereferenceable(4) [[N:%.*]], i64 [[VLA:%.*]], ptr [[VLA1:%.*]], ptr [[A:%.*]], i64 [[VLA2:%.*]], i64 [[VLA4:%.*]], 
ptr nonnull align 8 dereferenceable(8) [[VLA26:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK17-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr [[S:%.*]], ptr nonnull align 4 dereferenceable(4) [[N:%.*]], i64 [[VLA:%.*]], ptr [[VLA1:%.*]], ptr [[A:%.*]], i64 [[VLA2:%.*]], i64 [[VLA4:%.*]], ptr nonnull align 8 dereferenceable(8) [[VLA26:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2237,7 +2209,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP69:%.*]] = icmp ne i32 [[TMP68]], 0 // CHECK17-NEXT: br i1 [[TMP69]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2St7St_funcEPS_iPg_l144(ptr [[TMP9]], i64 [[TMP1]], ptr [[TMP10]], i64 [[TMP3]], i64 [[TMP5]], ptr [[VLA]], ptr [[THIS1]], i64 [[TMP12]]) #[[ATTR5]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2St7St_funcEPS_iPg_l144(ptr [[TMP9]], i64 [[TMP1]], ptr [[TMP10]], i64 [[TMP3]], i64 [[TMP5]], ptr [[VLA]], ptr [[THIS1]], i64 [[TMP12]]) #[[ATTR4]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK17: omp_offload.cont: // CHECK17-NEXT: [[TMP70:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8 @@ -2276,7 +2248,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2St7St_funcEPS_iPg_l144.omp_outlined -// CHECK17-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[VLA:%.*]], ptr [[VLA1:%.*]], ptr [[THIS:%.*]], i64 [[VLA2:%.*]], i64 [[VLA4:%.*]], ptr nonnull align 8 dereferenceable(8) [[VLA26:%.*]], ptr nonnull align 4 dereferenceable(4) [[N:%.*]], ptr [[S:%.*]]) #[[ATTR3]] { +// CHECK17-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias 
[[DOTBOUND_TID_:%.*]], i64 [[VLA:%.*]], ptr [[VLA1:%.*]], ptr [[THIS:%.*]], i64 [[VLA2:%.*]], i64 [[VLA4:%.*]], ptr nonnull align 8 dereferenceable(8) [[VLA26:%.*]], ptr nonnull align 4 dereferenceable(4) [[N:%.*]], ptr [[S:%.*]]) #[[ATTR2]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2340,13 +2312,6 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@_Z10array_funcPfP2StiPe // CHECK19-SAME: (ptr [[A:%.*]], ptr [[S:%.*]], i32 [[N:%.*]], ptr [[VLA1:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ -2469,7 +2434,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: [[TMP55:%.*]] = icmp ne i32 [[TMP54]], 0 // CHECK19-NEXT: br i1 [[TMP55]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z10array_funcPfP2StiPe_l152(ptr [[TMP5]], ptr [[TMP6]], i32 [[TMP0]], ptr [[TMP7]], i32 [[TMP1]], i32 [[TMP2]], ptr [[VLA]], i32 [[TMP9]]) #[[ATTR5:[0-9]+]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z10array_funcPfP2StiPe_l152(ptr [[TMP5]], ptr [[TMP6]], i32 [[TMP0]], ptr [[TMP7]], i32 [[TMP1]], i32 [[TMP2]], ptr [[VLA]], i32 [[TMP9]]) #[[ATTR4:[0-9]+]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[TMP56:%.*]] = load ptr, ptr [[SAVED_STACK]], align 4 @@ -2508,7 +2473,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK19-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z10array_funcPfP2StiPe_l152.omp_outlined -// CHECK19-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr [[S:%.*]], ptr nonnull align 4 dereferenceable(4) [[N:%.*]], i32 [[VLA:%.*]], ptr [[VLA1:%.*]], ptr [[A:%.*]], i32 [[VLA2:%.*]], i32 [[VLA4:%.*]], ptr nonnull align 4 dereferenceable(8) [[VLA26:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK19-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr [[S:%.*]], ptr nonnull align 4 dereferenceable(4) [[N:%.*]], i32 [[VLA:%.*]], ptr [[VLA1:%.*]], ptr [[A:%.*]], i32 [[VLA2:%.*]], i32 [[VLA4:%.*]], ptr nonnull align 4 dereferenceable(8) [[VLA26:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2705,7 +2670,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: [[TMP67:%.*]] = icmp ne i32 [[TMP66]], 0 // CHECK19-NEXT: br i1 [[TMP67]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2St7St_funcEPS_iPe_l144(ptr [[TMP6]], i32 [[TMP0]], ptr [[TMP7]], i32 [[TMP1]], i32 [[TMP2]], ptr [[VLA]], ptr [[THIS1]], i32 [[TMP9]]) #[[ATTR5]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2St7St_funcEPS_iPe_l144(ptr [[TMP6]], i32 [[TMP0]], ptr [[TMP7]], i32 [[TMP1]], i32 [[TMP2]], ptr [[VLA]], ptr [[THIS1]], i32 [[TMP9]]) #[[ATTR4]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[TMP68:%.*]] = load ptr, ptr [[SAVED_STACK]], align 4 @@ -2744,7 +2709,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2St7St_funcEPS_iPe_l144.omp_outlined -// CHECK19-SAME: (ptr noalias 
[[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[VLA:%.*]], ptr [[VLA1:%.*]], ptr [[THIS:%.*]], i32 [[VLA2:%.*]], i32 [[VLA4:%.*]], ptr nonnull align 4 dereferenceable(8) [[VLA26:%.*]], ptr nonnull align 4 dereferenceable(4) [[N:%.*]], ptr [[S:%.*]]) #[[ATTR3]] { +// CHECK19-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[VLA:%.*]], ptr [[VLA1:%.*]], ptr [[THIS:%.*]], i32 [[VLA2:%.*]], i32 [[VLA4:%.*]], ptr nonnull align 4 dereferenceable(8) [[VLA26:%.*]], ptr nonnull align 4 dereferenceable(4) [[N:%.*]], ptr [[S:%.*]]) #[[ATTR2]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2805,10 +2770,3 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: call void @llvm.stackrestore.p0(ptr [[TMP15]]) // CHECK19-NEXT: ret void // -// -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp b/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp index d0ae405306b8e..041a28bd87e7c 100644 --- a/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp +++ b/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp @@ -560,7 +560,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34 -// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 @@ -761,13 +761,6 @@ int main (int 
argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_argument_globali // CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1149,7 +1142,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34 -// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 @@ -1345,13 +1338,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1442,7 +1428,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK9-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: 
omp_offload.cont: // CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 0 @@ -1468,7 +1454,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1559,7 +1545,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1663,13 +1649,6 @@ int main 
(int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@_Z15teams_local_argv // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1760,7 +1739,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK11-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i32 0 @@ -1786,7 +1765,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1875,7 +1854,7 @@ int 
main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1976,13 +1955,6 @@ int main (int argc, char **argv) { // CHECK11-NEXT: ret void // // -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// -// // CHECK17-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK17-SAME: () #[[ATTR0:[0-9]+]] { // CHECK17-NEXT: entry: @@ -2041,7 +2013,7 @@ int main (int argc, char **argv) { // CHECK17-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK17-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK17: omp_offload.failed: -// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]] // 
CHECK17: omp_offload.cont: // CHECK17-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2061,7 +2033,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2124,7 +2096,7 @@ int main (int argc, char **argv) { // // // CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined.omp_outlined -// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK17-NEXT: entry: // CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2201,13 +2173,6 @@ int main (int argc, char **argv) { // CHECK17-NEXT: ret void // // -// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK17-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK17-NEXT: entry: -// CHECK17-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK17-NEXT: ret void -// -// // CHECK19-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK19-SAME: () #[[ATTR0:[0-9]+]] { // CHECK19-NEXT: entry: @@ 
-2266,7 +2231,7 @@ int main (int argc, char **argv) { // CHECK19-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK19-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK19: omp_offload.failed: -// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK19: omp_offload.cont: // CHECK19-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -2286,7 +2251,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK19-NEXT: entry: // CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2347,7 +2312,7 @@ int main (int argc, char **argv) { // // // CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined.omp_outlined -// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK19-NEXT: entry: // 
CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -2421,13 +2386,6 @@ int main (int argc, char **argv) { // CHECK19-NEXT: ret void // // -// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK19-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK19-NEXT: entry: -// CHECK19-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK19-NEXT: ret void -// -// // CHECK25-LABEL: define {{[^@]+}}@main // CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK25-NEXT: entry: @@ -2524,7 +2482,7 @@ int main (int argc, char **argv) { // CHECK25-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK25-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK25: omp_offload.failed: -// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK25: omp_offload.cont: // CHECK25-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -2552,7 +2510,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // 
CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2643,7 +2601,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2748,7 +2706,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK25-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -2824,14 +2782,14 @@ int main (int argc, char **argv) { // CHECK25-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK25-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK25: omp_offload.failed: -// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR4]] +// CHECK25-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR3]] // CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK25: omp_offload.cont: // CHECK25-NEXT: ret i32 0 // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150 -// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[TE_ADDR:%.*]] = alloca i64, align 8 // CHECK25-NEXT: [[TH_ADDR:%.*]] = alloca i64, align 8 @@ -2849,7 +2807,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2912,7 +2870,7 @@ int main (int argc, char **argv) { // // // CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined.omp_outlined -// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef 
[[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK25-NEXT: entry: // CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -2988,13 +2946,6 @@ int main (int argc, char **argv) { // CHECK25-NEXT: ret void // // -// CHECK25-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK25-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK25-NEXT: entry: -// CHECK25-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK25-NEXT: ret void -// -// // CHECK27-LABEL: define {{[^@]+}}@main // CHECK27-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK27-NEXT: entry: @@ -3091,7 +3042,7 @@ int main (int argc, char **argv) { // CHECK27-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 // CHECK27-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK27: omp_offload.failed: -// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK27: omp_offload.cont: // CHECK27-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -3119,7 +3070,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3208,7 +3159,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3310,7 +3261,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_ -// CHECK27-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK27-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK27-NEXT: [[A:%.*]] = alloca [10 x i32], align 4 @@ -3386,14 +3337,14 @@ int main (int argc, char **argv) { // CHECK27-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 // CHECK27-NEXT: br i1 [[TMP33]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK27: omp_offload.failed: -// CHECK27-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR4]] +// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR3]] // CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK27: omp_offload.cont: // CHECK27-NEXT: ret i32 0 // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150 -// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[TE_ADDR:%.*]] = alloca i32, align 4 // CHECK27-NEXT: [[TH_ADDR:%.*]] = alloca i32, align 4 @@ -3411,7 +3362,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3472,7 +3423,7 @@ int main (int argc, char **argv) { // // // CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined.omp_outlined -// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) 
[[A:%.*]]) #[[ATTR3]] { +// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] { // CHECK27-NEXT: entry: // CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -3544,10 +3495,3 @@ int main (int argc, char **argv) { // CHECK27-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]]) // CHECK27-NEXT: ret void // -// -// CHECK27-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK27-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK27-NEXT: entry: -// CHECK27-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK27-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp index ce71a4620facc..4cf8f88b4c084 100644 --- a/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp +++ b/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp @@ -161,7 +161,7 @@ int main (int argc, char **argv) { // CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -182,7 +182,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// 
CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -247,7 +247,7 @@ int main (int argc, char **argv) { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -338,13 +338,6 @@ int main (int argc, char **argv) { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR4:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -404,7 +397,7 @@ int main (int argc, char **argv) { // CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 @@ -425,7 +418,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -488,7 +481,7 @@ int main (int argc, char **argv) { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -575,13 +568,6 @@ int main (int argc, char **argv) { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () 
#[[ATTR4:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -711,7 +697,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 // CHECK9-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[TMP51:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -744,7 +730,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = 
alloca ptr, align 8 @@ -856,7 +842,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1012,7 +998,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK9-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -1061,7 +1047,7 @@ int main (int argc, char **argv) { // CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR4]] +// CHECK9-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR3]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: ret i32 0 @@ -1078,7 +1064,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1143,7 +1129,7 @@ int main (int argc, char **argv) { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1233,13 +1219,6 @@ int main (int argc, char **argv) { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void 
@__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1368,7 +1347,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0 // CHECK11-NEXT: br i1 [[TMP49]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR4:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4 @@ -1401,7 +1380,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1515,7 +1494,7 @@ int main (int argc, 
char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1671,7 +1650,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_ -// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4 @@ -1720,7 +1699,7 @@ int main (int argc, char **argv) { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR4]] +// CHECK11-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR3]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: ret i32 0 @@ -1737,7 +1716,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1800,7 +1779,7 @@ int main (int argc, char **argv) { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR3]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1885,10 +1864,3 @@ int main (int argc, char **argv) { // CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]]) // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: 
() #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp index b86f1440ec9c9..ab5cbdf1b8c9e 100644 --- a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp +++ b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp @@ -307,7 +307,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -398,7 +398,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -512,7 +512,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR1]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -612,7 +612,7 @@ int 
main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -706,7 +706,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -875,13 +875,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR0]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -1047,7 +1040,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // 
CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1136,7 +1129,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1246,7 +1239,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR7:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR1]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1346,7 +1339,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1438,7 +1431,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], 
i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1603,13 +1596,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR0]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1734,7 +1720,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75 -// CHECK9-SAME: (i64 noundef [[G1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (i64 noundef [[G1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[G1_ADDR:%.*]] = alloca i64, align 8 // CHECK9-NEXT: [[TMP:%.*]] = alloca ptr, align 8 @@ -1745,7 +1731,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1812,7 +1798,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], 
i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR6]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1909,10 +1895,3 @@ int main() { // CHECK9-NEXT: call void @__cxx_global_var_init.2() // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR0]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp index 3cbb5cfc5effa..f91ee759cddf0 100644 --- a/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp +++ b/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp @@ -145,7 +145,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i64 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i64 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() @@ -162,7 +162,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef 
[[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -228,8 +228,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -247,7 +247,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -327,8 +327,8 @@ int main() { // CHECK1-NEXT: store ptr [[SIVAR2]], 
ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -346,7 +346,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -366,7 +366,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -386,7 +386,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[VEC:%.*]] = 
alloca [2 x i32], align 4 @@ -439,7 +439,7 @@ int main() { // CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR3]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR2]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret i32 0 @@ -455,7 +455,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -521,8 +521,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // 
CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -540,7 +540,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -620,8 +620,8 @@ int main() { // CHECK1-NEXT: store ptr [[T_VAR2]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK1-NEXT: ] // CHECK1: .omp.reduction.case1: // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -639,7 +639,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// 
CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -659,7 +659,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -678,13 +678,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -737,7 +730,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i32 [[TMP1]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i32 [[TMP1]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() @@ -754,7 +747,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -818,8 +811,8 @@ int main() { // CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -837,7 +830,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -915,8 +908,8 @@ int main() { // CHECK3-NEXT: store 
ptr [[SIVAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -934,7 +927,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -954,7 +947,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -974,7 +967,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: 
[[VEC:%.*]] = alloca [2 x i32], align 4 @@ -1027,7 +1020,7 @@ int main() { // CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 // CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR3]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR2]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: ret i32 0 @@ -1043,7 +1036,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1107,8 +1100,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 4 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: 
.omp.reduction.case1: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1126,7 +1119,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR2]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1204,8 +1197,8 @@ int main() { // CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK3-NEXT: ] // CHECK3: .omp.reduction.case1: // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1223,7 +1216,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef 
[[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -1243,7 +1236,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func -// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 @@ -1262,13 +1255,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -1289,7 +1275,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1355,8 +1341,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8 // CHECK9-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr 
[[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1374,7 +1360,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR3]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1458,8 +1444,8 @@ int main() { // CHECK9-NEXT: store ptr [[SIVAR2]], ptr [[TMP15]], align 8 // CHECK9-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK9-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ -// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] -// CHECK9-NEXT: i32 2, label 
[[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK9-NEXT: ] // CHECK9: .omp.reduction.case1: // CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -1477,7 +1463,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined.omp.reduction.reduction_func -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -1497,7 +1483,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func -// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -1515,10 +1501,3 @@ int main() { // CHECK9-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 // CHECK9-NEXT: ret void // -// -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// diff --git a/clang/test/OpenMP/teams_private_codegen.cpp b/clang/test/OpenMP/teams_private_codegen.cpp index 175de892c4551..0126545c5915b 100644 --- a/clang/test/OpenMP/teams_private_codegen.cpp +++ b/clang/test/OpenMP/teams_private_codegen.cpp @@ -248,14 +248,14 @@ int main() { // CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 // CHECK1-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label 
[[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48(ptr [[THIS1]]) #[[ATTR5:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48 -// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -265,7 +265,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -297,7 +297,7 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@_ZZN2SSC1ERiENKUlvE_clEv -// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(32) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(32) [[THIS:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -323,14 +323,14 @@ int main() { // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l117 -// CHECK1-SAME: () #[[ATTR3]] { +// CHECK1-SAME: () #[[ATTR2]] { // CHECK1-NEXT: entry: 
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l117.omp_outlined) // CHECK1-NEXT: ret void // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l117.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -349,13 +349,6 @@ int main() { // CHECK1-NEXT: ret void // // -// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK1-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK1-NEXT: entry: -// CHECK1-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK1-NEXT: ret void -// -// // CHECK3-LABEL: define {{[^@]+}}@main // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: @@ -441,14 +434,14 @@ int main() { // CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 // CHECK3-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK3: omp_offload.failed: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48(ptr [[THIS1]]) #[[ATTR5:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48(ptr [[THIS1]]) #[[ATTR3:[0-9]+]] // CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK3: omp_offload.cont: // CHECK3-NEXT: ret void // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48 -// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 @@ -458,7 
+451,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -490,7 +483,7 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@_ZZN2SSC1ERiENKUlvE_clEv -// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] align 2 { +// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR1]] align 2 { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 @@ -516,14 +509,14 @@ int main() { // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l117 -// CHECK3-SAME: () #[[ATTR3]] { +// CHECK3-SAME: () #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l117.omp_outlined) // CHECK3-NEXT: ret void // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l117.omp_outlined -// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -542,13 +535,6 @@ int main() { // CHECK3-NEXT: ret void // // -// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK3-SAME: () #[[ATTR6:[0-9]+]] { -// CHECK3-NEXT: entry: -// CHECK3-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK3-NEXT: ret void -// -// // CHECK9-LABEL: define {{[^@]+}}@main // CHECK9-SAME: () #[[ATTR0:[0-9]+]] { // CHECK9-NEXT: entry: @@ -600,23 +586,23 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l136() #[[ATTR5:[0-9]+]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l136() #[[ATTR4:[0-9]+]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v() // CHECK9-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0 // CHECK9-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done1: -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP16]] // @@ -665,7 +651,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l136.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -693,14 +679,14 @@ int main() { // CHECK9-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 0 // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX1]], ptr align 4 [[VAR]], i64 
4, i1 false) // CHECK9-NEXT: store i32 3, ptr [[SIVAR]], align 4 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN2]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP1]], [[ARRAYCTOR_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done3: @@ -713,12 +699,12 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // // CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK9-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK9-SAME: () #[[ATTR1]] comdat { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, 
align 4 // CHECK9-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -767,22 +753,22 @@ int main() { // CHECK9-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK9-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l86() #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l86() #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 // CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done1: -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] 
+// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK9-NEXT: ret i32 [[TMP16]] // @@ -847,7 +833,7 @@ int main() { // CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 // CHECK9-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48(ptr [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48(ptr [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: ret void @@ -864,7 +850,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -969,7 +955,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l86.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -995,14 +981,14 @@ int main() { // CHECK9-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX]], align 128 // CHECK9-NEXT: 
[[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0 // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[ARRAYIDX1]], ptr align 128 [[VAR]], i64 4, i1 false) -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN2]], i64 2 // CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK9: arraydestroy.body: // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP1]], [[ARRAYCTOR_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1 -// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]] // CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK9: arraydestroy.done3: @@ -1015,7 +1001,7 @@ int main() { // CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: ret void // // @@ -1080,7 +1066,7 @@ int main() { // CHECK9-NEXT: 
[[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK9: omp_offload.failed: -// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN3SSTIiEC1Ev_l64(ptr [[THIS1]]) #[[ATTR5]] +// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN3SSTIiEC1Ev_l64(ptr [[THIS1]]) #[[ATTR4]] // CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK9: omp_offload.cont: // CHECK9-NEXT: ret void @@ -1097,7 +1083,7 @@ int main() { // // // CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN3SSTIiEC1Ev_l64.omp_outlined -// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR4]] { +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR3]] { // CHECK9-NEXT: entry: // CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -1139,13 +1125,6 @@ int main() { // CHECK9-NEXT: ret void // // -// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK9-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK9-NEXT: entry: -// CHECK9-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK9-NEXT: ret void -// -// // CHECK11-LABEL: define {{[^@]+}}@main // CHECK11-SAME: () #[[ATTR0:[0-9]+]] { // CHECK11-NEXT: entry: @@ -1197,23 +1176,23 @@ int main() { // CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK11-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l136() #[[ATTR5:[0-9]+]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l136() #[[ATTR4:[0-9]+]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: 
[[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v() // CHECK11-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done1: -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP16]] // @@ -1262,7 +1241,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l136.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK11-SAME: (ptr 
noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1290,14 +1269,14 @@ int main() { // CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0 // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX1]], ptr align 4 [[VAR]], i32 4, i1 false) // CHECK11-NEXT: store i32 3, ptr [[SIVAR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN2]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP1]], [[ARRAYCTOR_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done3: @@ -1310,12 +1289,12 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // 
CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // // CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiET_v -// CHECK11-SAME: () #[[ATTR6:[0-9]+]] comdat { +// CHECK11-SAME: () #[[ATTR1]] comdat { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK11-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4 @@ -1364,22 +1343,22 @@ int main() { // CHECK11-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 // CHECK11-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l86() #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l86() #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr 
noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done1: -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR4]] // CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK11-NEXT: ret i32 [[TMP16]] // @@ -1444,7 +1423,7 @@ int main() { // CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 // CHECK11-NEXT: br i1 [[TMP20]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48(ptr [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48(ptr [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: ret void @@ -1461,7 +1440,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSC1ERi_l48.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1566,7 +1545,7 @@ int main() { // // // CHECK11-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l86.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1592,14 +1571,14 @@ int main() { // CHECK11-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX]], align 128 // CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[ARRAYIDX1]], ptr align 128 [[VAR]], i32 4, i1 false) -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN2]], i32 2 // CHECK11-NEXT: br label [[ARRAYDESTROY_BODY:%.*]] // CHECK11: arraydestroy.body: // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP1]], [[ARRAYCTOR_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ] // CHECK11-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1 -// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR4]] // CHECK11-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]] // CHECK11-NEXT: br i1 [[ARRAYDESTROY_DONE]], label 
[[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]] // CHECK11: arraydestroy.done3: @@ -1612,7 +1591,7 @@ int main() { // CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: ret void // // @@ -1677,7 +1656,7 @@ int main() { // CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 // CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK11: omp_offload.failed: -// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN3SSTIiEC1Ev_l64(ptr [[THIS1]]) #[[ATTR5]] +// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN3SSTIiEC1Ev_l64(ptr [[THIS1]]) #[[ATTR4]] // CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK11: omp_offload.cont: // CHECK11-NEXT: ret void @@ -1694,7 +1673,7 @@ int main() { // // // CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN3SSTIiEC1Ev_l64.omp_outlined -// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR4]] { +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR3]] { // CHECK11-NEXT: entry: // CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 // CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 @@ -1735,10 +1714,3 @@ int main() { // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: ret void // -// -// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg -// CHECK11-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK11-NEXT: entry: -// 
CHECK11-NEXT: call void @__tgt_register_requires(i64 1) -// CHECK11-NEXT: ret void -// diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 2288969ecc95c..4cca4f4d6bc1c 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -342,6 +342,8 @@ class OffloadEntriesInfoManager { OMPTargetGlobalVarEntryNone = 0x3, /// Mark the entry as a declare target indirect global. OMPTargetGlobalVarEntryIndirect = 0x8, + /// Mark the entry as a register requires global. + OMPTargetGlobalRegisterRequires = 0x10, }; /// Kind of device clause for declare target variables @@ -2628,16 +2630,6 @@ class OpenMPIRBuilder { /// \param Name Name of the variable. GlobalVariable *getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace = 0); - - /// Create a global function to register OpenMP requires flags into the - /// runtime, according to the `Config`. - /// - /// This function should be added to the list of constructors of the - /// compilation unit in order to be called before other OpenMP runtime - /// functions. - /// - /// \param Name Name of the created function. - Function *createRegisterRequires(StringRef Name); }; /// Class to represented the control flow structure of an OpenMP canonical loop. diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 02b333e9ccd56..2e96772772e6e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6602,6 +6602,17 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata( llvm_unreachable("Unsupported entry kind."); } } + + // Emit requires directive globals to a special entry so the runtime can + // register them when the device image is loaded. + // TODO: This reduces the offloading entries to a 32-bit integer. Offloading + // entries should be redesigned to better suit this use-case. 
+ if (Config.hasRequiresFlags() && !Config.isTargetDevice()) + offloading::emitOffloadingEntry( + M, Constant::getNullValue(PointerType::getUnqual(M.getContext())), + /*Name=*/"", + /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires, + Config.getRequiresFlags(), "omp_offloading_entries"); } void TargetRegionEntryInfo::getTargetRegionEntryFnName( @@ -6881,35 +6892,6 @@ void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) { loadOffloadInfoMetadata(*M.get()); } -Function *OpenMPIRBuilder::createRegisterRequires(StringRef Name) { - // Skip the creation of the registration function if this is device codegen - if (Config.isTargetDevice()) - return nullptr; - - Builder.ClearInsertionPoint(); - - // Create registration function prototype - auto *RegFnTy = FunctionType::get(Builder.getVoidTy(), {}); - auto *RegFn = Function::Create( - RegFnTy, GlobalVariable::LinkageTypes::InternalLinkage, Name, M); - RegFn->setSection(".text.startup"); - RegFn->addFnAttr(Attribute::NoInline); - RegFn->addFnAttr(Attribute::NoUnwind); - - // Create registration function body - auto *BB = BasicBlock::Create(M.getContext(), "entry", RegFn); - ConstantInt *FlagsVal = - ConstantInt::getSigned(Builder.getInt64Ty(), Config.getRequiresFlags()); - Function *RTLRegFn = getOrCreateRuntimeFunctionPtr( - omp::RuntimeFunction::OMPRTL___tgt_register_requires); - - Builder.SetInsertPoint(BB); - Builder.CreateCall(RTLRegFn, {FlagsVal}); - Builder.CreateRetVoid(); - - return RegFn; -} - //===----------------------------------------------------------------------===// // OffloadEntriesInfoManager //===----------------------------------------------------------------------===// diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index e79d0bb2f65ae..7b54ae675f0c1 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6735,44 +6735,4 @@ 
TEST_F(OpenMPIRBuilderTest, createGPUOffloadEntry) { EXPECT_TRUE(Fn->hasFnAttribute(Attribute::MustProgress)); } -TEST_F(OpenMPIRBuilderTest, CreateRegisterRequires) { - OpenMPIRBuilder OMPBuilder(*M); - OMPBuilder.initialize(); - - OpenMPIRBuilderConfig Config(/* IsTargetDevice = */ false, - /* IsGPU = */ false, - /* OpenMPOffloadMandatory = */ false, - /* HasRequiresReverseOffload = */ true, - /* HasRequiresUnifiedAddress = */ false, - /* HasRequiresUnifiedSharedMemory = */ true, - /* HasRequiresDynamicAllocators = */ false); - OMPBuilder.setConfig(Config); - - auto FName = - OMPBuilder.createPlatformSpecificName({"omp_offloading", "requires_reg"}); - EXPECT_EQ(FName, ".omp_offloading.requires_reg"); - - Function *Fn = OMPBuilder.createRegisterRequires(FName); - EXPECT_NE(Fn, nullptr); - EXPECT_EQ(FName, Fn->getName()); - - EXPECT_EQ(Fn->getSection(), ".text.startup"); - EXPECT_TRUE(Fn->hasInternalLinkage()); - EXPECT_TRUE(Fn->hasFnAttribute(Attribute::NoInline)); - EXPECT_TRUE(Fn->hasFnAttribute(Attribute::NoUnwind)); - EXPECT_EQ(Fn->size(), 1u); - - BasicBlock *Entry = &Fn->getEntryBlock(); - EXPECT_FALSE(Entry->empty()); - EXPECT_EQ(Fn->getReturnType()->getTypeID(), Type::VoidTyID); - - CallInst *Call = &cast(*Entry->begin()); - EXPECT_EQ(Call->getCalledFunction()->getName(), "__tgt_register_requires"); - EXPECT_EQ(Call->getNumOperands(), 2u); - - Value *Flags = Call->getArgOperand(0); - EXPECT_EQ(cast(Flags)->getSExtValue(), - OMPBuilder.Config.getRequiresFlags()); -} - } // namespace diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 78a2ad76a1e3b..6e53d801a0d2f 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2768,27 +2768,6 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, return success(); } -/// Converts the 
module-level set of OpenMP requires clauses into LLVM IR using -/// OpenMPIRBuilder. -static LogicalResult -convertRequiresAttr(Operation &op, omp::ClauseRequiresAttr requiresAttr, - LLVM::ModuleTranslation &moduleTranslation) { - auto *ompBuilder = moduleTranslation.getOpenMPBuilder(); - - // No need to read requiresAttr here, because it has already been done in - // translateModuleToLLVMIR(). There, flags are stored in the - // OpenMPIRBuilderConfig object, available to the OpenMPIRBuilder. - auto *regFn = - ompBuilder->createRegisterRequires(ompBuilder->createPlatformSpecificName( - {"omp_offloading", "requires_reg"})); - - // Add registration function as global constructor - if (regFn) - llvm::appendToGlobalCtors(ompBuilder->M, regFn, /* Priority = */ 0); - - return success(); -} - namespace { /// Implementation of the dialect interface that converts operations belonging @@ -2891,7 +2870,7 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( bitEnumContainsAll(flags, Requires::unified_shared_memory)); config.setHasRequiresDynamicAllocators( bitEnumContainsAll(flags, Requires::dynamic_allocators)); - return convertRequiresAttr(*op, requiresAttr, moduleTranslation); + return success(); } return failure(); }) diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 036367b262f07..39a1e036e85c0 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -2744,10 +2744,4 @@ llvm.func @omp_task_if(%boolexpr: i1) { // ----- -// Check that OpenMP requires flags are registered by a global constructor. 
-// CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] -// CHECK-SAME: [{ i32, ptr, ptr } { i32 0, ptr @[[REG_FN:.*]], ptr null }] -// CHECK: define {{.*}} @[[REG_FN]]({{.*}}) -// CHECK-NOT: } -// CHECK: call void @__tgt_register_requires(i64 10) module attributes {omp.requires = #omp} {} diff --git a/openmp/libomptarget/include/Shared/APITypes.h b/openmp/libomptarget/include/Shared/APITypes.h index 94521b4fbb577..e8fc27785b6c2 100644 --- a/openmp/libomptarget/include/Shared/APITypes.h +++ b/openmp/libomptarget/include/Shared/APITypes.h @@ -30,7 +30,7 @@ struct __tgt_offload_entry { char *name; // Name of the function or global size_t size; // Size of the entry info (0 if it is a function) int32_t flags; // Flags associated with the entry, e.g. 'link'. - int32_t reserved; // Reserved, to be used by the runtime library. + int32_t data; // Extra data associated with the entry. }; /// This struct is a record of the device image information diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index 3016467b3abdf..c4faa23427f11 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -92,7 +92,9 @@ enum OpenMPOffloadingDeclareTargetFlags { /// Mark the entry global as having a 'link' attribute. OMP_DECLARE_TARGET_LINK = 0x01, /// Mark the entry global as being an indirectly callable function. - OMP_DECLARE_TARGET_INDIRECT = 0x08 + OMP_DECLARE_TARGET_INDIRECT = 0x08, + /// This is an entry corresponding to a requirement to be registered. 
+ OMP_REGISTER_REQUIRES = 0x10, }; enum TargetAllocTy : int32_t { diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp index 0693d4bd6c91e..34f1f4969da30 100644 --- a/openmp/libomptarget/src/PluginManager.cpp +++ b/openmp/libomptarget/src/PluginManager.cpp @@ -199,6 +199,12 @@ static void registerImageIntoTranslationTable(TranslationTable &TT, void PluginManager::registerLib(__tgt_bin_desc *Desc) { PM->RTLsMtx.lock(); + // Add in all the OpenMP requirements associated with this binary. + for (__tgt_offload_entry &Entry : + llvm::make_range(Desc->HostEntriesBegin, Desc->HostEntriesEnd)) + if (Entry.flags == OMP_REGISTER_REQUIRES) + PM->addRequirements(Entry.data); + // Extract the exectuable image and extra information if availible. for (int32_t i = 0; i < Desc->NumDeviceImages; ++i) PM->addDeviceImage(*Desc, Desc->DeviceImages[i]); diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 8725e5eb55fc9..d2707f39a1aa3 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -33,7 +33,9 @@ using namespace llvm::omp::target::ompt; //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t Flags) { - PM->addRequirements(Flags); + MESSAGE("The %s function has been removed. 
Old OpenMP requirements will not " + "be handled", + __PRETTY_FUNCTION__); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 821669d21483f..3b3e17f0f311e 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -194,7 +194,7 @@ static int initLibrary(DeviceTy &Device) { Entry.size) != OFFLOAD_SUCCESS) REPORT("Failed to write symbol for USM %s\n", Entry.name); } - } else { + } else if (Entry.addr) { if (Device.RTL->get_function(Binary, Entry.name, &DeviceEntry.addr) != OFFLOAD_SUCCESS) REPORT("Failed to load kernel %s\n", Entry.name); diff --git a/openmp/libomptarget/test/offloading/requires.c b/openmp/libomptarget/test/offloading/requires.c index cf01a73661d8f..2a129a7ae86dc 100644 --- a/openmp/libomptarget/test/offloading/requires.c +++ b/openmp/libomptarget/test/offloading/requires.c @@ -16,7 +16,34 @@ // --------------------------------------------------------------------------- // Various definitions copied from OpenMP RTL -extern void __tgt_register_requires(int64_t); +typedef struct { + void *addr; + char *name; + size_t size; + int32_t flags; + int32_t data; +} __tgt_offload_entry; + +enum Flags { + OMP_REGISTER_REQUIRES = 0x10, +}; + +typedef struct { + void *ImageStart; + void *ImageEnd; + __tgt_offload_entry *EntriesBegin; + __tgt_offload_entry *EntriesEnd; +} __tgt_device_image; + +typedef struct { + int32_t NumDeviceImages; + __tgt_device_image *DeviceImages; + __tgt_offload_entry *HostEntriesBegin; + __tgt_offload_entry *HostEntriesEnd; +} __tgt_bin_desc; + +void __tgt_register_lib(__tgt_bin_desc *Desc); +void __tgt_unregister_lib(__tgt_bin_desc *Desc); // End of definitions copied from OpenMP RTL. 
// --------------------------------------------------------------------------- @@ -28,11 +55,17 @@ void run_reg_requires() { // This is the 2nd time this function is called so it should print SUCCESS if // REQ is compatible with `1` and otherwise cause an error. - __tgt_register_requires(1); - __tgt_register_requires(REQ); + __tgt_offload_entry entries[] = {{NULL, "", 0, OMP_REGISTER_REQUIRES, 1}, + {NULL, "", 0, OMP_REGISTER_REQUIRES, REQ}}; + __tgt_device_image image = {NULL, NULL, &entries[0], &entries[1] + 1}; + __tgt_bin_desc bin = {1, &image, &entries[0], &entries[1] + 1}; + + __tgt_register_lib(&bin); printf("SUCCESS"); + __tgt_unregister_lib(&bin); + // clang-format off // GOOD: SUCCESS // BAD: omptarget fatal error 2: '#pragma omp requires reverse_offload' not used consistently! From cc13f3ba45015254075434f0f94a2ea6ff4bc1b4 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Wed, 21 Feb 2024 12:37:02 -0500 Subject: [PATCH 126/351] Correctly round FP -> BF16 when SDAG expands such nodes (#82399) We did something pretty naive: - round FP64 -> BF16 by first rounding to FP32 - skip FP32 -> BF16 rounding entirely - taking the top 16 bits of a FP32 which will turn some NaNs into infinities Let's do this in a more principled way by rounding types with more precision than FP32 to FP32 using round-inexact-to-odd which will negate double rounding issues. 
--- llvm/include/llvm/CodeGen/TargetLowering.h | 13 + llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 10 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 122 + llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 79 + llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 3 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 10 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 19201 +++++++++++++--- .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 14 + .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 4 + llvm/test/CodeGen/AMDGPU/function-args.ll | 95 +- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 224 +- .../isel-amdgpu-cs-chain-preserve-cc.ll | 1462 +- .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 53 + llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 54 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 293 +- llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 2 +- 16 files changed, 17199 insertions(+), 4440 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 612433b54f6e4..f2e00aab8d5da 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5124,6 +5124,19 @@ class TargetLowering : public TargetLoweringBase { /// \returns The expansion result SDValue expandFP_TO_INT_SAT(SDNode *N, SelectionDAG &DAG) const; + /// Truncate Op to ResultVT. If the result is exact, leave it alone. If it is + /// not exact, force the result to be odd. + /// \param ResultVT The type of result. + /// \param Op The value to round. + /// \returns The expansion result + SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, + SelectionDAG &DAG) const; + + /// Expand round(fp) to fp conversion + /// \param N Node to expand + /// \returns The expansion result + SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const; + /// Expand check for floating point class. /// \param ResultVT The type of intrinsic call result. /// \param Op The tested value. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 6272c3093cff6..f5b7752f7ecc8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3217,10 +3217,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } break; case ISD::FP_ROUND: { - EVT VT = Node->getValueType(0); - if (VT.getScalarType() == MVT::bf16) { - Results.push_back( - DAG.getNode(ISD::FP_TO_BF16, SDLoc(Node), VT, Node->getOperand(0))); + if ((Tmp1 = TLI.expandFP_ROUND(Node, DAG))) { + Results.push_back(Tmp1); break; } @@ -3293,6 +3291,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if (Op.getValueType() != MVT::f32) Op = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op, DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); + // Certain SNaNs will turn into infinities if we do a simple shift right. + if (!DAG.isKnownNeverSNaN(Op)) { + Op = DAG.getNode(ISD::FCANONICALIZE, dl, MVT::f32, Op, Node->getFlags()); + } Op = DAG.getNode( ISD::SRL, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op), DAG.getConstant(16, dl, diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f689cd120ecbd..d059dc66d0588 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10855,6 +10855,128 @@ SDValue TargetLowering::expandFP_TO_INT_SAT(SDNode *Node, return DAG.getSelect(dl, DstVT, IsNan, ZeroInt, Select); } +SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op, + const SDLoc &dl, + SelectionDAG &DAG) const { + EVT OperandVT = Op.getValueType(); + if (OperandVT.getScalarType() == ResultVT.getScalarType()) + return Op; + EVT ResultIntVT = ResultVT.changeTypeToInteger(); + // We are rounding binary64/binary128 -> binary32 -> bfloat16. This + // can induce double-rounding which may alter the results. 
We can + // correct for this using a trick explained in: Boldo, Sylvie, and + // Guillaume Melquiond. "When double rounding is odd." 17th IMACS + // World Congress. 2005. + unsigned BitSize = OperandVT.getScalarSizeInBits(); + EVT WideIntVT = OperandVT.changeTypeToInteger(); + SDValue OpAsInt = DAG.getBitcast(WideIntVT, Op); + SDValue SignBit = + DAG.getNode(ISD::AND, dl, WideIntVT, OpAsInt, + DAG.getConstant(APInt::getSignMask(BitSize), dl, WideIntVT)); + SDValue AbsWide; + if (isOperationLegalOrCustom(ISD::FABS, OperandVT)) { + AbsWide = DAG.getNode(ISD::FABS, dl, OperandVT, Op); + } else { + SDValue ClearedSign = DAG.getNode( + ISD::AND, dl, WideIntVT, OpAsInt, + DAG.getConstant(APInt::getSignedMaxValue(BitSize), dl, WideIntVT)); + AbsWide = DAG.getBitcast(OperandVT, ClearedSign); + } + SDValue AbsNarrow = DAG.getFPExtendOrRound(AbsWide, dl, ResultVT); + SDValue AbsNarrowAsWide = DAG.getFPExtendOrRound(AbsNarrow, dl, OperandVT); + + // We can keep the narrow value as-is if narrowing was exact (no + // rounding error), the wide value was NaN (the narrow value is also + // NaN and should be preserved) or if we rounded to the odd value. 
+ SDValue NarrowBits = DAG.getNode(ISD::BITCAST, dl, ResultIntVT, AbsNarrow); + SDValue One = DAG.getConstant(1, dl, ResultIntVT); + SDValue NegativeOne = DAG.getAllOnesConstant(dl, ResultIntVT); + SDValue And = DAG.getNode(ISD::AND, dl, ResultIntVT, NarrowBits, One); + EVT ResultIntVTCCVT = getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), And.getValueType()); + SDValue Zero = DAG.getConstant(0, dl, ResultIntVT); + SDValue AlreadyOdd = DAG.getSetCC(dl, ResultIntVTCCVT, And, Zero, ISD::SETNE); + + EVT WideSetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + AbsWide.getValueType()); + SDValue KeepNarrow = + DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETUEQ); + KeepNarrow = DAG.getNode(ISD::OR, dl, WideSetCCVT, KeepNarrow, AlreadyOdd); + // We morally performed a round-down if `abs_narrow` is smaller than + // `abs_wide`. + SDValue NarrowIsRd = + DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETOGT); + // If the narrow value is odd or exact, pick it. + // Otherwise, narrow is even and corresponds to either the rounded-up + // or rounded-down value. If narrow is the rounded-down value, we want + // the rounded-up value as it will be odd. 
+ SDValue Adjust = DAG.getSelect(dl, ResultIntVT, NarrowIsRd, One, NegativeOne); + Adjust = DAG.getSelect(dl, ResultIntVT, KeepNarrow, Zero, Adjust); + int ShiftAmount = BitSize - ResultVT.getScalarSizeInBits(); + SDValue ShiftCnst = DAG.getShiftAmountConstant(ShiftAmount, WideIntVT, dl); + SignBit = DAG.getNode(ISD::SRL, dl, WideIntVT, SignBit, ShiftCnst); + SignBit = DAG.getNode(ISD::TRUNCATE, dl, ResultIntVT, SignBit); + Op = DAG.getNode(ISD::OR, dl, ResultIntVT, Adjust, SignBit); + return DAG.getNode(ISD::BITCAST, dl, ResultVT, Op); +} + +SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const { + assert(Node->getOpcode() == ISD::FP_ROUND && "Unexpected opcode!"); + SDValue Op = Node->getOperand(0); + EVT VT = Node->getValueType(0); + SDLoc dl(Node); + if (VT.getScalarType() == MVT::bf16) { + if (Node->getConstantOperandVal(1) == 1) { + return DAG.getNode(ISD::FP_TO_BF16, dl, VT, Node->getOperand(0)); + } + EVT OperandVT = Op.getValueType(); + SDValue IsNaN = DAG.getSetCC( + dl, + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), OperandVT), + Op, Op, ISD::SETUO); + + // We are rounding binary64/binary128 -> binary32 -> bfloat16. This + // can induce double-rounding which may alter the results. We can + // correct for this using a trick explained in: Boldo, Sylvie, and + // Guillaume Melquiond. "When double rounding is odd." 17th IMACS + // World Congress. 2005. + EVT F32 = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; + EVT I32 = F32.changeTypeToInteger(); + Op = expandRoundInexactToOdd(F32, Op, dl, DAG); + Op = DAG.getNode(ISD::BITCAST, dl, I32, Op); + + // Extract the sign bit. + SDValue SignBit = + DAG.getNode(ISD::AND, dl, I32, Op, + DAG.getConstant(APInt::getSignMask(32), dl, I32)); + // Set the quiet bit. + SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBit, + DAG.getConstant(0x400000, dl, I32)); + + // Factor in the contribution of the low 16 bits. 
+ SDValue One = DAG.getConstant(1, dl, I32); + SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Op, + DAG.getShiftAmountConstant(16, I32, dl)); + Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One); + SDValue RoundingBias = + DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb); + SDValue Add = DAG.getNode(ISD::ADD, dl, I32, Op, RoundingBias); + + // Don't round if we had a NaN, we don't want to turn 0x7fffffff into + // 0x80000000. + Op = DAG.getSelect(dl, I32, IsNaN, NaN, Add); + + // Now that we have rounded, shift the bits into position. + Op = DAG.getNode(ISD::SRL, dl, I32, Op, + DAG.getShiftAmountConstant(16, I32, dl)); + Op = DAG.getNode(ISD::BITCAST, dl, I32, Op); + EVT I16 = I32.isVector() ? I32.changeVectorElementType(MVT::i16) : MVT::i16; + Op = DAG.getNode(ISD::TRUNCATE, dl, I16, Op); + return DAG.getNode(ISD::BITCAST, dl, VT, Op); + } + return SDValue(); +} + SDValue TargetLowering::expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const { assert(Node->getOpcode() == ISD::VECTOR_SPLICE && "Unexpected opcode!"); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7f58b312e7a20..ef3b61fbd0dea 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -776,6 +776,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, AddPromotedToType(Op, MVT::bf16, MVT::f32); } + if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) { + setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand); + } + if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom); + setOperationAction(ISD::BF16_TO_FP, MVT::f64, Custom); + } + // sm_80 only has conversions between f32 and bf16. Custom lower all other // bf16 conversions. 
if (STI.hasBF16Math() && @@ -2465,6 +2474,72 @@ SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op, return Op; } +SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op, + SelectionDAG &DAG) const { + EVT NarrowVT = Op.getValueType(); + SDValue Wide = Op.getOperand(0); + EVT WideVT = Wide.getValueType(); + if (NarrowVT.getScalarType() == MVT::bf16) { + const TargetLowering *TLI = STI.getTargetLowering(); + if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) { + return TLI->expandFP_ROUND(Op.getNode(), DAG); + } + if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { + // This combination was the first to support f32 -> bf16. + if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) { + if (WideVT.getScalarType() == MVT::f32) { + return Op; + } + if (WideVT.getScalarType() == MVT::f64) { + SDLoc Loc(Op); + // Round-inexact-to-odd f64 to f32, then do the final rounding using + // the hardware f32 -> bf16 instruction. + SDValue rod = TLI->expandRoundInexactToOdd( + WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32) + : MVT::f32, + Wide, Loc, DAG); + return DAG.getFPExtendOrRound(rod, Loc, NarrowVT); + } + } + return TLI->expandFP_ROUND(Op.getNode(), DAG); + } + } + + // Everything else is considered legal. + return Op; +} + +SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + SDValue Narrow = Op.getOperand(0); + EVT NarrowVT = Narrow.getValueType(); + EVT WideVT = Op.getValueType(); + if (NarrowVT.getScalarType() == MVT::bf16) { + if (WideVT.getScalarType() == MVT::f32 && + (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) { + SDLoc Loc(Op); + return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow); + } + if (WideVT.getScalarType() == MVT::f64 && + (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { + EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32) + : MVT::f32; + EVT F64 = NarrowVT.isVector() ? 
NarrowVT.changeVectorElementType(MVT::f64) + : MVT::f64; + SDLoc Loc(Op); + if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) { + Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow); + } else { + Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow); + } + return DAG.getNode(ISD::FP_EXTEND, Loc, F64, Op); + } + } + + // Everything else is considered legal. + return Op; +} + static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); if (Op.getValueType() != MVT::v2i16) @@ -2527,6 +2602,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); + case ISD::FP_ROUND: + return LowerFP_ROUND(Op, DAG); + case ISD::FP_EXTEND: + return LowerFP_EXTEND(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VASTART: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 5d3fd992812ef..cf1d458076691 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -618,6 +618,9 @@ class NVPTXTargetLowering : public TargetLowering { SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 631136ad62146..40d82ebecbed3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -662,7 +662,7 @@ let hasSideEffects = false in { // bf16->f32 was introduced early. 
[hasPTX<71>, hasSM<80>], // bf16->everything else needs sm90/ptx78 - [hasPTX<78>, hasSM<90>])>; + [hasPTX<78>, hasSM<90>])>; def _f32 : NVPTXInst<(outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode), @@ -3647,7 +3647,7 @@ def : Pat<(f16 (fpround Float32Regs:$a)), // fpround f32 -> bf16 def : Pat<(bf16 (fpround Float32Regs:$a)), - (CVT_bf16_f32 Float32Regs:$a, CvtRN)>; + (CVT_bf16_f32 Float32Regs:$a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>; // fpround f64 -> f16 def : Pat<(f16 (fpround Float64Regs:$a)), @@ -3655,7 +3655,7 @@ def : Pat<(f16 (fpround Float64Regs:$a)), // fpround f64 -> bf16 def : Pat<(bf16 (fpround Float64Regs:$a)), - (CVT_bf16_f64 Float64Regs:$a, CvtRN)>; + (CVT_bf16_f64 Float64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; // fpround f64 -> f32 def : Pat<(f32 (fpround Float64Regs:$a)), (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; @@ -3671,7 +3671,7 @@ def : Pat<(f32 (fpextend (f16 Int16Regs:$a))), def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))), (CVT_f32_bf16 Int16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend (bf16 Int16Regs:$a))), - (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>; + (CVT_f32_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 def : Pat<(f64 (fpextend (f16 Int16Regs:$a))), @@ -3679,7 +3679,7 @@ def : Pat<(f64 (fpextend (f16 Int16Regs:$a))), // fpextend bf16 -> f64 def : Pat<(f64 (fpextend (bf16 Int16Regs:$a))), - (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>; + (CVT_f64_bf16 Int16Regs:$a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>; // fpextend f32 -> f64 def : Pat<(f64 (fpextend Float32Regs:$a)), diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 387c4a16a008a..e841a8867fc52 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -924,8 +924,10 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 @@ -937,7 +939,9 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-LABEL: v_store_global_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 @@ -980,13 +984,16 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -995,13 +1002,16 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-LABEL: v_store_global_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; 
GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1047,9 +1057,13 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 @@ -1062,8 +1076,12 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-LABEL: v_store_global_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 @@ -1109,28 +1127,44 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v10, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16 +; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -1175,53 +1209,85 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v16bf16: ; GCN: ; 
%bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v18, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v19, v12, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 
16, v9 +; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16 +; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16 +; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16 +; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v14, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v12, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, 
v7 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v12, v0, v10, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v8, 16 +; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 @@ -1269,49 +1335,82 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 ; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v0, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v1, v12, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GCN-NEXT: v_alignbit_b32 v11, v0, v10, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_alignbit_b32 v10, v0, v8, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; GCN-NEXT: v_alignbit_b32 v9, v0, v22, 16 
-; GCN-NEXT: v_alignbit_b32 v8, v1, v20, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_alignbit_b32 v7, v0, v18, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v27 -; GCN-NEXT: v_alignbit_b32 v6, v0, v16, 16 -; GCN-NEXT: v_alignbit_b32 v16, v1, v28, 16 -; GCN-NEXT: v_alignbit_b32 v15, v14, v26, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GCN-NEXT: v_alignbit_b32 v12, v6, v7, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v11, v0, v1, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v10, v0, v1, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v9, v0, v1, 16 +; GCN-NEXT: v_alignbit_b32 v8, v6, v7, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v7, v0, v1, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_alignbit_b32 v6, v0, v1, 
16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v14, 16 +; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_alignbit_b32 v14, v0, v24, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v14, v0, v14, 16 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v30 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: v_alignbit_b32 v17, v17, v30, 16 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GCN-NEXT: v_alignbit_b32 v17, v6, v18, 16 ; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 ; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -1320,49 +1419,82 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-LABEL: v_store_global_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GFX7-NEXT: v_alignbit_b32 
v13, v0, v14, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v10, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX7-NEXT: v_alignbit_b32 v10, v0, v8, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v9, v0, v22, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v12, v1, v12, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; GFX7-NEXT: v_alignbit_b32 v7, v0, v18, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX7-NEXT: v_alignbit_b32 v8, v1, v20, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v27 -; GFX7-NEXT: v_alignbit_b32 v6, v0, v16, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX7-NEXT: v_alignbit_b32 v16, v1, v28, 16 -; GFX7-NEXT: v_alignbit_b32 v15, v14, v26, 16 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v24, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; GFX7-NEXT: 
v_alignbit_b32 v10, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 +; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 +; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 +; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26 +; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v18, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_alignbit_b32 v17, 
v17, v30, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1420,301 +1552,428 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v64bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v21, v21, v20, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 +; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v20, v19, v18, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_alignbit_b32 v19, v19, v16, 16 +; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[16:17], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: v_mul_f32_e32 v15, 
1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 +; GCN-NEXT: v_alignbit_b32 v12, v18, v12, 16 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 +; GCN-NEXT: 
v_alignbit_b32 v11, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_alignbit_b32 v11, v8, v10, 16 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v9, v12, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v13, 16 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:112 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_mul_f32_e32 v12, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_alignbit_b32 v11, v8, v10, 16 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v9, v12, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 ; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v13, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 
v15, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:96 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v12, v4, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v28, 16 -; GCN-NEXT: v_alignbit_b32 v7, v14, v26, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_alignbit_b32 v6, v15, v24, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_alignbit_b32 v3, v1, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NEXT: v_alignbit_b32 v1, v1, v13, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, 
v18 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v6, v5, v19, 16 +; GCN-NEXT: v_alignbit_b32 v5, v13, v21, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v22 +; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; GCN-NEXT: s_waitcnt vmcnt(9) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v9, v9, v30, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v11, 16 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GCN-NEXT: v_alignbit_b32 v7, v8, v15, 16 +; GCN-NEXT: v_alignbit_b32 v11, v9, v20, 16 +; GCN-NEXT: v_alignbit_b32 v10, v21, v10, 16 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_alignbit_b32 v11, v11, v0, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v9, 
1.0, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v14, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v1, v14, 16 -; GCN-NEXT: v_alignbit_b32 v22, v15, v16, 16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_alignbit_b32 v21, v20, v0, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_alignbit_b32 v20, v15, v0, 16 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_alignbit_b32 v15, v14, v15, 16 +; GCN-NEXT: v_alignbit_b32 v14, v19, v12, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: 
v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v19, v0, v14, 16 -; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[17:18], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[17:18], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v64bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 +; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 -; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v14, 16 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX7-NEXT: v_alignbit_b32 v12, v1, v12, 16 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v10, 16 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 -; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX7-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GFX7-NEXT: v_alignbit_b32 v8, v21, v20, 16 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: v_alignbit_b32 v9, v23, v22, 16 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: s_waitcnt vmcnt(10) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: v_alignbit_b32 v23, v6, v7, 16 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_mul_f32_e32 v0, 
1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_alignbit_b32 v22, v15, v31, 16 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_alignbit_b32 v21, v20, v33, 16 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v34 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GFX7-NEXT: v_alignbit_b32 v7, v6, v18, 16 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16 ; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_alignbit_b32 v20, v32, v14, 16 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v17 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: buffer_store_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:112 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 -; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 -; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v29 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; GFX7-NEXT: v_alignbit_b32 v16, v15, v28, 16 -; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:72 -; GFX7-NEXT: buffer_load_dword v29, 
off, s[0:3], s32 offset:68 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX7-NEXT: v_alignbit_b32 v17, v14, v30, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v27 -; GFX7-NEXT: v_alignbit_b32 v15, v14, v26, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v25 -; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v24, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v31 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_alignbit_b32 v21, v19, v32, 16 -; GFX7-NEXT: s_waitcnt vmcnt(13) -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_alignbit_b32 v20, v19, v34, 16 -; GFX7-NEXT: s_waitcnt vmcnt(11) -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; GFX7-NEXT: s_waitcnt vmcnt(10) -; GFX7-NEXT: v_alignbit_b32 v19, v19, v37, 16 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37 +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 +; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: 
buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112 +; GFX7-NEXT: s_nop 0 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 +; GFX7-NEXT: s_waitcnt vmcnt(9) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v28 +; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_waitcnt vmcnt(9) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 ; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_alignbit_b32 v25, v22, v23, 16 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v39 +; GFX7-NEXT: v_alignbit_b32 v36, v0, 
v1, 16 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v38 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v48 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v39 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_alignbit_b32 v23, v18, v48, 16 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 -; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; GFX7-NEXT: v_alignbit_b32 v35, v18, v19, 16 +; GFX7-NEXT: v_alignbit_b32 v34, v0, v1, 16 +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12 ; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v28 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v33, v6, v14, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 ; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_alignbit_b32 v22, v22, v29, 16 -; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 -; GFX7-NEXT: buffer_load_dword v29, off, 
s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30 +; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29 +; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16 +; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26 +; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 +; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25 +; GFX7-NEXT: v_alignbit_b32 v16, v16, v20, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v24 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v20, 16 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_alignbit_b32 v21, v0, v1, 16 +; GFX7-NEXT: s_waitcnt vmcnt(13) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_waitcnt vmcnt(12) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; GFX7-NEXT: v_alignbit_b32 v20, v0, v1, 16 +; GFX7-NEXT: s_waitcnt vmcnt(11) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22 
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt vmcnt(10) -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: v_alignbit_b32 v18, v22, v18, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; GFX7-NEXT: v_alignbit_b32 v19, v0, v1, 16 ; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v27 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_alignbit_b32 v25, v22, v30, 16 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v35 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_alignbit_b32 v24, v23, v32, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_alignbit_b32 v23, v22, v28, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v33 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v34 +; GFX7-NEXT: v_alignbit_b32 v25, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; GFX7-NEXT: v_alignbit_b32 v24, v22, v23, 16 +; GFX7-NEXT: v_alignbit_b32 v23, v0, v1, 16 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_alignbit_b32 v22, v22, v34, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:80 -; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:64 -; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 -; 
GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; GFX7-NEXT: v_alignbit_b32 v22, v0, v1, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[31:32], s[4:7], 0 addr64 offset:80 +; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[31:32], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1895,6 +2154,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -1909,6 +2169,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1919,6 +2180,13 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 
v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: flat_store_short v[2:3], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1928,7 +2196,14 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1937,7 +2212,13 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1945,7 +2226,14 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load float, ptr addrspace(1) %in @@ -1990,7 +2278,24 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5] +; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5] +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v5, v4, v7 +; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: flat_store_short v[2:3], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2000,8 +2305,25 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_brev_b32 s8, 1 +; GFX9-NEXT: s_movk_i32 s9, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5] +; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5] +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4 +; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2011,7 +2333,22 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5] +; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5] +; GFX10-NEXT: s_or_b32 s4, s4, vcc_lo +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4 +; GFX10-NEXT: s_mov_b32 s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 +; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2020,7 +2357,27 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5] +; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5] +; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0 +; GFX11-NEXT: s_mov_b32 s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 +; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff +; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v5, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load double, ptr addrspace(1) %in @@ -2468,11 +2825,12 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 
s[30:31] @@ -2481,10 +2839,11 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2522,8 +2881,10 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 @@ -2535,7 +2896,9 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7-LABEL: test_arg_store_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 @@ -2578,13 +2941,16 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -2593,13 +2959,16 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7-LABEL: test_arg_store_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2645,9 +3014,13 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 @@ -2660,8 +3033,12 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7-LABEL: test_arg_store_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 @@ -2707,28 +3084,44 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v10, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16 +; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: 
v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -2773,53 +3166,85 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v18, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: 
v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v19, v12, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16 +; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16 +; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16 +; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_arg_store_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v14, 16 -; 
GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v12, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v12, v0, v10, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v8, 16 +; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 @@ -2867,12 +3292,12 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GCN-LABEL: test_inreg_arg_store: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s34, s4, 16 -; GCN-NEXT: s_mov_b32 s38, 0 ; GCN-NEXT: s_mov_b32 s39, 0xf000 +; GCN-NEXT: s_mov_b32 s38, 0 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GCN-NEXT: s_mov_b32 s36, s38 ; GCN-NEXT: s_mov_b32 s37, s38 -; GCN-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2880,12 +3305,12 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX7-LABEL: test_inreg_arg_store: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s34, s4, 16 ; GFX7-NEXT: s_mov_b32 s38, 0 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7-NEXT: s_mov_b32 s39, 0xf000 ; GFX7-NEXT: s_mov_b32 s36, s38 ; GFX7-NEXT: s_mov_b32 s37, s38 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2927,7 +3352,8 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GCN-LABEL: test_byval: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2935,7 +3361,8 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GFX7-LABEL: test_byval: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2974,6 +3401,7 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GCN-LABEL: test_sret: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -2982,6 +3410,7 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { 
; GFX7-LABEL: test_sret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3371,6 +3800,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -3401,6 +3831,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v2, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3556,9 +3987,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 ; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen @@ -3590,6 +4023,8 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 @@ -3749,9 +4184,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -3784,9 +4222,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3953,13 +4394,17 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4 
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4 ; GCN-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen @@ -3995,12 +4440,16 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v6, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 6, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4169,6 +4618,21 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -4177,13 +4641,6 @@ define void @test_call_v8bf16(<8 x 
bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8 ; GCN-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v6, v10, s[0:3], 0 offen @@ -4227,28 +4684,36 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v10, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 14, v8 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4419,85 +4884,101 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v28, s30, 0 -; GCN-NEXT: v_writelane_b32 v28, s31, 1 +; GCN-NEXT: v_writelane_b32 v21, s30, 0 +; GCN-NEXT: v_writelane_b32 v21, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 
v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16 ; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16 ; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16 ; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 22, v16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 20, v16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 18, v16 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 14, v16 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 12, v16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 10, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 8, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 22, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 20, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 2, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 18, v16 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 14, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; 
GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_store_short v11, v15, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 6, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_store_short v10, v17, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v10, vcc, 2, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_store_short v9, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v11, v21, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v10, v22, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v9, v23, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v8, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v8, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v7, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v7, v13, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v4, v20, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v3, v11, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v28, 1 -; GCN-NEXT: v_readlane_b32 s30, v28, 0 +; GCN-NEXT: v_readlane_b32 s31, v21, 1 +; GCN-NEXT: v_readlane_b32 s30, v21, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -4521,60 +5002,76 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v18, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 30, v16 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16 +; GFX7-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4772,6 +5269,7 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GCN-LABEL: test_alloca_load_store_ret: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -4783,6 +5281,7 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GFX7-LABEL: test_alloca_load_store_ret: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4870,7 +5369,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 @@ -4908,6 +5407,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen @@ -4933,6 
+5433,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen @@ -8466,6 +8967,8 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fadd_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 @@ -8475,6 +8978,8 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fadd_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 @@ -8487,6 +8992,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -8496,6 +9008,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; 
GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8504,7 +9023,13 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -8513,8 +9038,16 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd bfloat %a, %b @@ -8525,6 +9058,10 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_fadd_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; 
GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -8538,6 +9075,10 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-LABEL: v_fadd_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -8553,10 +9094,24 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -8566,10 +9121,23 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 
x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8581,9 +9149,20 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v2bf16: @@ 
-8593,11 +9172,24 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <2 x bfloat> %a, %b ret <2 x bfloat> %op @@ -8607,6 +9199,12 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_fadd_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -8624,6 +9222,12 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-LABEL: v_fadd_v3bf16: ; GFX7: ; %bb.0: ; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -8644,12 +9248,34 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: 
v_alignbit_b32 v0, v0, v3, 16 @@ -8661,12 +9287,31 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 @@ -8675,16 +9320,32 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-LABEL: v_fadd_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: 
v_add_f32_e32 v4, v5, v4 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = fadd <3 x bfloat> %a, %b @@ -8695,6 +9356,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_fadd_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -8716,6 +9385,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-LABEL: v_fadd_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: 
v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -8739,17 +9416,46 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 
v3, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 @@ -8760,16 +9466,41 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; 
GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 @@ -8781,17 +9512,38 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 
v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v4bf16: @@ -8799,19 +9551,45 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, 
v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -8821,6 +9599,22 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-LABEL: v_fadd_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, 
v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -8858,6 +9652,22 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-LABEL: v_fadd_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -8897,31 +9707,88 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, 
v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v7 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v6 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v5 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v9, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 @@ -8934,28 +9801,77 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; 
GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_bfe_u32 v9, 
v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -8968,65 +9884,157 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v9, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 +; GFX10-NEXT: v_add_f32_e32 v9, v11, v9 +; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_and_or_b32 v12, 
v9, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX10-NEXT: v_add_f32_e32 v6, v11, v6 +; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v10, v11, v10 -; GFX10-NEXT: v_add_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX10-NEXT: v_add_f32_e32 v5, v15, v13 +; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, 
v11, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000 +; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v11, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v6, 
0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v9, v11, v9 +; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_dual_add_f32 v10, v11, v10 :: v_dual_add_f32 v11, v13, v12 +; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_add3_u32 v10, 
v11, v3, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff +; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v5, v15, v13 +; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <8 x bfloat> %a, %b ret <8 x bfloat> %op @@ -9036,36 +10044,67 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_fadd_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_add_f32_e32 v14, v14, v30 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_add_f32_e32 v13, v13, v29 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_add_f32_e32 v12, v12, v28 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_add_f32_e32 v11, v11, v27 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_add_f32_e32 v10, v10, v26 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_add_f32_e32 v9, v9, v25 +; 
GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_add_f32_e32 v8, v8, v24 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_add_f32_e32 v7, v7, v23 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: v_add_f32_e32 v6, v6, v22 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_add_f32_e32 v5, v5, v21 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_add_f32_e32 v4, v4, v20 @@ -9098,7 +10137,8 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_add_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -9107,12 +10147,41 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) 
{ ; GFX7-LABEL: v_fadd_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, 
v29 @@ -9129,18 +10198,19 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 @@ -9149,11 +10219,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v19 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v18 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -9161,6 +10232,10 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; 
GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_add_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -9169,9 +10244,6 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 -; GFX7-NEXT: v_add_f32_e32 v15, v15, v16 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9180,51 +10252,165 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v16 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v17, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v15 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v14 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1 
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v13 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v12 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v11 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v10 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v9 ; 
GFX8-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -9232,7 +10418,6 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v9, v17, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 @@ -9249,52 +10434,149 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; 
GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, 
v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, 
vcc +; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, 
v0, v9, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 @@ -9313,119 +10595,297 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX10-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX10-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 +; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff ; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_add_f32_e32 v14, v19, v18 +; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 +; GFX10-NEXT: v_add_f32_e32 v17, v20, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4 ; GFX10-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX10-NEXT: v_add_f32_e32 v13, v21, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo +; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v13, v19, v18 +; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff +; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo +; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v18, v20, v19 +; GFX10-NEXT: v_add_f32_e32 v12, v18, v12 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX10-NEXT: 
v_add_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo +; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff +; GFX10-NEXT: v_add_f32_e32 v18, v19, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000 +; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff +; GFX10-NEXT: v_add_f32_e32 v19, v22, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v19, v20, v19 
-; GFX10-NEXT: v_add_f32_e32 v20, v22, v21 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX10-NEXT: v_add_f32_e32 v9, v22, v20 +; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo +; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff +; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, 
v15, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX11-NEXT: v_add_f32_e32 v13, v21, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 -; GFX11-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 -; GFX11-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v17, v18, v17 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 +; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff +; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff +; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11 -; GFX11-NEXT: v_add_f32_e32 v14, v19, v18 +; GFX11-NEXT: v_dual_add_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16 +; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18 +; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff +; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e32 v12, v18, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff +; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX11-NEXT: v_and_or_b32 v21, 
v4, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v18, v19, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX11-NEXT: v_dual_add_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff +; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v18, v18 +; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v19, v22, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v19, v20, v19 :: v_dual_add_f32 v20, v22, v21 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_add_f32 v1, v1, v9 +; GFX11-NEXT: v_add_f32_e32 v9, v22, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff +; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo +; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <16 x bfloat> %a, %b ret <16 x bfloat> %op @@ -9435,166 +10895,230 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-LABEL: v_fadd_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_add_f32_e32 v31, v32, v31 +; GCN-NEXT: v_add_f32_e32 v31, v31, v32 +; 
GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_add_f32_e32 v30, v30, v32 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v29, v29, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_add_f32_e32 v29, v29, v32 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_add_f32_e32 v28, v28, v32 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v27, v27, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_add_f32_e32 v27, v27, v32 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_add_f32_e32 v26, v26, v32 +; GCN-NEXT: 
v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v25, v25, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_add_f32_e32 v25, v25, v32 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_add_f32_e32 v24, v24, v32 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v23, v23, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_add_f32_e32 v23, v23, v32 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_add_f32_e32 v22, v22, v32 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v21, v21, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; 
GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_add_f32_e32 v21, v21, v32 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_add_f32_e32 v20, v20, v32 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v19, v19, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_add_f32_e32 v19, v19, v32 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_add_f32_e32 v18, v18, v32 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v17, v17, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; GCN-NEXT: v_add_f32_e32 v17, v17, v32 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 ; GCN-NEXT: 
v_add_f32_e32 v16, v16, v32 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v15, v15, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_add_f32_e32 v15, v15, v32 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_add_f32_e32 v14, v14, v32 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v13, v13, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_add_f32_e32 v13, v13, v32 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_add_f32_e32 v12, v12, v32 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v11, v11, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_add_f32_e32 v11, v11, v32 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_add_f32_e32 v10, v10, v32 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v9, v9, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_add_f32_e32 v9, v9, v32 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_add_f32_e32 v8, v8, v32 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v7, v7, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_add_f32_e32 v7, v7, v32 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; 
GCN-NEXT: v_add_f32_e32 v6, v6, v32 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v5, v5, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_add_f32_e32 v5, v5, v32 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_add_f32_e32 v4, v4, v32 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v3, v3, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_add_f32_e32 v3, v3, v32 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_add_f32_e32 v2, v2, v32 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v1, v1, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 
0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_add_f32_e32 v1, v1, v32 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_add_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -9634,197 +11158,261 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-LABEL: v_fadd_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 
0xffff0000, v18 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_add_f32_e32 v31, v31, v32 ; GFX7-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: 
v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; 
GFX7-NEXT: v_add_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 
GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -9835,114 +11423,329 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX8-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_add_f32_e32 v30, v14, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX8-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 +; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 +; GFX8-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v30 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX8-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_add_f32_e32 v14, v32, v14 ; GFX8-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX8-NEXT: v_add_f32_e32 v33, v33, v34 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_add_f32_e32 v30, v15, v30 +; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v33 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v30 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 +; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GFX8-NEXT: v_add_f32_e32 v29, v33, v29 +; GFX8-NEXT: 
v_bfe_u32 v33, v29, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_add_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v29 ; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 +; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX8-NEXT: v_add_f32_e32 v28, v33, v28 +; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_add_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v28 ; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX8-NEXT: v_add_f32_e32 v27, v33, v27 +; 
GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_add_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v27 ; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX8-NEXT: v_add_f32_e32 v26, v33, v26 +; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_add_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v26 ; GFX8-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 +; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX8-NEXT: v_add_f32_e32 v25, v33, v25 +; GFX8-NEXT: 
v_bfe_u32 v33, v25, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v25 ; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX8-NEXT: v_add_f32_e32 v25, v32, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX8-NEXT: v_add_f32_e32 v32, v32, v33 -; GFX8-NEXT: v_add_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX8-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 
0xffff0000, v7 -; GFX8-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v24 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v23 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 +; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; 
GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v22 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 +; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v21 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v20 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_or_b32_e32 
v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v19 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v18 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 +; 
GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v17 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -9951,8 +11754,13 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: 
v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -9961,8 +11769,13 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v32bf16: @@ -9970,110 +11783,296 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 ; GFX9-NEXT: v_and_b32_e32 
v29, 0xffff0000, v29 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 +; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v30 +; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v29, v15, v29 +; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v32 +; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v29 +; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v32 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, 
v28 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add_f32_e32 v29, v32, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v28, v33, v28 +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add_f32_e32 v28, v32, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v27, v33, v27 +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add_f32_e32 v27, v32, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v26, v33, v26 +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add_f32_e32 v26, v32, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v25, v33, v25 +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX9-NEXT: v_add_f32_e32 v25, v32, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, 
v15 -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add_f32_e32 v32, v32, v33 -; GFX9-NEXT: v_add_f32_e32 v15, v15, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: 
v_bfe_u32 v20, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 @@ -10082,7 +12081,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 ; GFX9-NEXT: v_perm_b32 v7, 
v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4 +; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v32bf16: @@ -10097,32 +12103,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v52, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -10141,29 +12125,28 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, 
v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 ; GFX10-NEXT: v_add_f32_e32 v25, v54, v53 ; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 @@ -10172,36 +12155,220 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_add_f32_e32 v23, v66, v65 ; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 ; GFX10-NEXT: v_add_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_add_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_add_f32_e32 v29, v29, v36 -; GFX10-NEXT: v_add_f32_e32 v28, v28, v38 -; GFX10-NEXT: v_add_f32_e32 v27, v27, v48 -; GFX10-NEXT: v_add_f32_e32 v26, v26, v50 +; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 +; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 +; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 +; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_add_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_add_f32_e32 v17, v26, v50 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 +; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 +; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff +; GFX10-NEXT: 
v_cmp_u_f32_e64 s11, v49, v49 +; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 +; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_add_f32_e32 v19, v28, v38 +; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 +; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 +; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 +; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 +; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_add_f32_e32 v21, v30, v34 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302 +; GFX10-NEXT: v_add_f32_e32 v20, v29, v36 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 +; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 +; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 +; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 +; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 +; GFX10-NEXT: 
v_add3_u32 v54, v54, v18, 0x7fff +; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 +; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 +; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff +; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 +; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 +; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 +; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 +; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 +; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 +; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000 +; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 +; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 +; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 +; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 +; GFX10-NEXT: 
v_cmp_u_f32_e64 s17, v24, v24 +; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 +; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 +; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 +; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 +; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff +; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 +; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000 +; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 +; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff +; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 +; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff +; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 +; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 +; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff +; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 +; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 +; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000 +; GFX10-NEXT: 
v_cmp_u_f32_e64 s7, v3, v3 +; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 +; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 +; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff +; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 +; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff +; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 +; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 +; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 +; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff +; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 ; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; 
GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 ; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 ; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 ; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GFX10-NEXT: v_add_f32_e32 v16, v32, v16 -; GFX10-NEXT: v_add_f32_e32 v15, v15, v17 -; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 +; GFX10-NEXT: v_add_f32_e32 v17, v32, v17 +; GFX10-NEXT: v_add_f32_e32 v15, v15, v18 +; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 +; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v32bf16: @@ -10212,102 +12379,269 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v1, 
v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_add_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11-NEXT: v_add_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 +; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 -; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 -; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v18 :: v_dual_add_f32 v3, v3, v19 -; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26 -; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX11-NEXT: v_add_f32_e32 v26, v52, 
v51 -; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX11-NEXT: v_add_f32_e32 v25, v54, v53 -; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_add_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_add_f32_e32 v18, v84, v83 +; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 +; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; 
GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_add_f32_e32 v20, v80, v71 +; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_add_f32_e32 v25, v54, v53 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX11-NEXT: v_add_f32_e32 v23, v66, v65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29 -; GFX11-NEXT: v_dual_add_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_add_f32_e32 v28, v48, v39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_add_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_add_f32 v26, v52, v51 +; GFX11-NEXT: v_dual_add_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_add_f32_e32 v37, v86, v85 -; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; 
GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_f32_e32 v14, v14, v30 ; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33 -; GFX11-NEXT: v_dual_add_f32 v34, v80, v71 :: v_dual_add_f32 v35, v82, v81 -; GFX11-NEXT: v_add_f32_e32 v36, v84, v83 -; GFX11-NEXT: v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v15, v15, v17 -; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 +; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff +; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff +; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v55, v11, 
16, 1 +; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 +; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff +; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff +; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 +; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 +; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff +; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff +; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 +; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 +; 
GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 -; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 +; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v17, v32, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v15, v15, v18 +; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <32 x bfloat> %a, %b ret <32 x bfloat> %op @@ -10317,6 +12651,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GCN-LABEL: v_fadd_bf16_fpimm_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -10325,6 +12660,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX7-LABEL: v_fadd_bf16_fpimm_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -10335,6 
+12671,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10343,6 +12686,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10350,7 +12700,13 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10358,8 +12714,16 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %add = fadd bfloat %arg0, 1.0 @@ -10370,6 +12734,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GCN-LABEL: v_fadd_bf16_fpimm_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -10378,6 +12743,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX7-LABEL: v_fadd_bf16_fpimm_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -10388,6 +12754,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10396,6 +12769,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10403,7 +12783,13 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10411,8 +12797,16 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %add = fadd bfloat %arg0, 42.0 @@ -10423,6 +12817,8 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fsub_bf16: ; GCN: 
; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -10432,6 +12828,8 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fsub_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -10444,6 +12842,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10453,6 +12858,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10461,7 +12873,13 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10470,8 +12888,16 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub bfloat %a, %b @@ -10482,6 +12908,10 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_fsub_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -10495,6 +12925,10 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-LABEL: v_fsub_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 
1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -10510,10 +12944,24 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -10523,10 +12971,23 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: 
v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10538,9 +12999,20 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fsub_v2bf16: @@ -10550,11 +13022,24 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub <2 x bfloat> %a, %b ret <2 x bfloat> %op @@ -10564,6 +13049,12 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_fsub_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -10581,6 +13072,12 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-LABEL: v_fsub_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -10601,12 +13098,34 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x 
bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 @@ -10618,12 +13137,31 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, 
v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 @@ -10632,16 +13170,32 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-LABEL: v_fsub_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; 
GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = fsub <3 x bfloat> %a, %b @@ -10652,6 +13206,14 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_fsub_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -10673,6 +13235,14 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-LABEL: v_fsub_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -10696,17 +13266,46 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_lshlrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: v_alignbit_b32 
v1, v1, v4, 16 @@ -10717,16 +13316,41 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 @@ -10738,17 +13362,38 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 
x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fsub_v4bf16: @@ -10756,19 +13401,45 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -10778,6 +13449,8 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fmul_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -10787,6 +13460,8 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fmul_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -10799,6 +13474,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 
v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -10808,6 +13490,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10816,7 +13505,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -10825,8 +13520,16 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul bfloat %a, %b @@ -10837,6 +13540,10 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_fmul_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -10850,6 +13557,10 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -10865,10 +13576,24 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 
0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -10878,10 +13603,23 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10893,9 +13631,20 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; 
GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v2bf16: @@ -10905,11 +13654,24 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <2 x bfloat> %a, %b ret <2 x bfloat> %op @@ -10919,6 +13681,12 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_fmul_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; 
GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -10936,6 +13704,12 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -10956,12 +13730,34 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; 
GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 @@ -10973,12 +13769,31 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 @@ -10987,16 +13802,32 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-LABEL: v_fmul_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = fmul <3 x bfloat> %a, %b @@ -11007,6 +13838,14 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_fmul_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ 
-11028,6 +13867,14 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -11051,17 +13898,46 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 @@ -11072,16 +13948,41 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 @@ -11093,17 +13994,38 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v1, 
s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v4bf16: @@ -11111,19 +14033,45 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; 
GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-NEXT: v_mul_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -11133,6 +14081,22 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-LABEL: v_fmul_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: 
v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -11170,6 +14134,22 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -11209,31 +14189,88 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 
0x7fff, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v7 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v6 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v5 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 @@ -11246,28 +14283,77 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 
0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v9, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -11280,65 +14366,157 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_mul_f32_e32 v9, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX10-NEXT: 
v_and_or_b32 v7, v8, s4, 0x400000 +; GFX10-NEXT: v_mul_f32_e32 v9, v11, v9 +; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX10-NEXT: v_mul_f32_e32 v6, v11, v6 +; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_mul_f32_e32 v10, v11, v10 -; GFX10-NEXT: v_mul_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX10-NEXT: v_mul_f32_e32 v5, v15, v13 +; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX10-NEXT: 
v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000 +; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v11, v10 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mul_f32_e32 v9, v11, v9 +; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000 +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_dual_mul_f32 v10, v11, v10 :: v_dual_mul_f32 v11, v13, v12 +; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff +; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_mul_f32_e32 v5, v15, v13 +; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <8 x bfloat> %a, %b ret <8 x bfloat> %op @@ -11348,36 +14526,67 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_fmul_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_mul_f32_e32 v14, v14, v30 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_mul_f32_e32 v13, v13, v29 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_mul_f32_e32 v12, v12, v28 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 
v10, 0xffff0000, v10 ; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 @@ -11410,7 +14619,8 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 +; 
GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_mul_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -11419,12 +14629,41 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 @@ -11441,18 +14680,19 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28 @@ -11461,11 +14701,12 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -11473,6 +14714,10 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -11481,9 +14726,6 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v15, v15, v16 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -11492,51 +14734,165 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v16 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 +; 
GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v15 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v14 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX8-NEXT: 
v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v13 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v12 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, 
v11, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v11 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v10 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: 
v_mul_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v9 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -11544,7 +14900,6 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 @@ -11561,52 +14916,149 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX9-NEXT: 
v_mul_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: 
v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; 
GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 
v8, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 @@ -11625,119 +15077,297 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX10-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 +; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff ; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_mul_f32_e32 v14, v19, v18 +; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 
0x400000 +; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 +; GFX10-NEXT: v_mul_f32_e32 v17, v20, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX10-NEXT: v_mul_f32_e32 v13, v21, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo +; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v13, v19, v18 +; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff +; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo +; GFX10-NEXT: 
v_and_or_b32 v19, v13, s4, 0x400000 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_mul_f32_e32 v18, v20, v19 +; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo +; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff +; GFX10-NEXT: v_mul_f32_e32 v18, v19, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000 +; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff +; GFX10-NEXT: v_mul_f32_e32 v19, v22, v20 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_mul_f32_e32 v19, v20, v19 -; GFX10-NEXT: v_mul_f32_e32 v20, v22, v21 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX10-NEXT: v_mul_f32_e32 v9, v22, v20 +; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo +; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff +; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: 
v_perm_b32 v0, v0, v8, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX11-NEXT: v_mul_f32_e32 v13, v21, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 -; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 -; GFX11-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mul_f32_e32 v17, v18, v17 +; GFX11-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 +; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff +; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff +; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v7, v7, v15, 
0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11 -; GFX11-NEXT: v_mul_f32_e32 v14, v19, v18 +; GFX11-NEXT: v_dual_mul_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16 +; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18 +; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff +; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_mul_f32_e32 v12, v18, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff +; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v18, v19, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_mul_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX11-NEXT: v_dual_mul_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff +; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff +; GFX11-NEXT: v_mul_f32_e32 v19, v22, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v19, v20, v19 :: v_dual_mul_f32 v20, v22, v21 -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_mul_f32 v1, v1, v9 +; GFX11-NEXT: v_mul_f32_e32 v9, v22, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff +; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-NEXT: 
v_bfe_u32 v23, v9, 16, 1 +; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo +; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <16 x bfloat> %a, %b ret <16 x bfloat> %op @@ -11747,166 +15377,230 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-LABEL: v_fmul_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v31, v32, v31 +; GCN-NEXT: v_mul_f32_e32 v31, v31, v32 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_mul_f32_e32 v30, v30, v32 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v29, v29, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_mul_f32_e32 v29, v29, v32 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_mul_f32_e32 v28, v28, v32 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v27, v27, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT: 
v_mul_f32_e32 v27, v27, v32 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_mul_f32_e32 v26, v26, v32 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v25, v25, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_mul_f32_e32 v25, v25, v32 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_mul_f32_e32 v24, v24, v32 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v23, v23, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_mul_f32_e32 v23, v23, v32 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_mul_f32_e32 v22, v22, v32 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; 
GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v21, v21, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_mul_f32_e32 v21, v21, v32 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_mul_f32_e32 v20, v20, v32 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v19, v19, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_mul_f32_e32 v19, v19, v32 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_mul_f32_e32 v18, v18, v32 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v17, v17, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, 
off, s[0:3], s32 offset:68 +; GCN-NEXT: v_mul_f32_e32 v17, v17, v32 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_mul_f32_e32 v16, v16, v32 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v15, v15, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_mul_f32_e32 v15, v15, v32 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v14, v14, v32 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v13, v13, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_mul_f32_e32 v13, v13, v32 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_mul_f32_e32 v12, v12, v32 +; 
GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v11, v11, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v11, v11, v32 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_mul_f32_e32 v10, v10, v32 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v9, v9, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v9, v9, v32 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_mul_f32_e32 v8, v8, v32 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v7, v7, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: 
buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_mul_f32_e32 v7, v7, v32 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_mul_f32_e32 v6, v6, v32 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v5, v5, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_mul_f32_e32 v5, v5, v32 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v4, v4, v32 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v3, v3, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v32 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v2, v2, v32 +; GCN-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v32 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -11946,197 +15640,261 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-LABEL: v_fmul_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 
0xffff0000, v22 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; 
GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32 ; 
GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 
v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -12147,114 +15905,329 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_mul_f32_e32 v30, v14, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 +; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 +; GFX8-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX8-NEXT: v_or_b32_e32 v32, 
0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v30 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_mul_f32_e32 v14, v32, v14 ; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30 +; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v33 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v30 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 +; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 +; GFX8-NEXT: v_and_b32_e32 
v33, 0x80000000, v13 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GFX8-NEXT: v_mul_f32_e32 v29, v33, v29 +; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_mul_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v29 ; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 +; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX8-NEXT: v_mul_f32_e32 v28, v33, v28 +; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_mul_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v28 ; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 +; GFX8-NEXT: 
v_and_b32_e32 v33, 0x80000000, v11 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX8-NEXT: v_mul_f32_e32 v27, v33, v27 +; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_mul_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v27 ; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX8-NEXT: v_mul_f32_e32 v26, v33, v26 +; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_mul_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v26 ; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 +; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 +; GFX8-NEXT: 
v_and_b32_e32 v33, 0x80000000, v9 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX8-NEXT: v_mul_f32_e32 v25, v33, v25 +; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX8-NEXT: v_mul_f32_e32 v25, v32, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX8-NEXT: v_mul_f32_e32 v32, v32, v33 -; GFX8-NEXT: v_mul_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX8-NEXT: v_or_b32_e32 v33, 
0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v24 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v23 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 +; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; 
GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v22 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 +; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v21 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX8-NEXT: 
v_mul_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v20 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v19 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, 
v18 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v18 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v17 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -12263,8 +16236,13 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: 
v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -12273,8 +16251,13 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v32bf16: @@ -12282,110 +16265,296 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, 
v33, vcc +; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 +; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v30 +; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_mul_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29 +; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v32 +; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v29 +; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 +; 
GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v32 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_mul_f32_e32 v29, v32, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28 +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_mul_f32_e32 v28, v32, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27 +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_mul_f32_e32 v27, v32, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26 +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_mul_f32_e32 v26, v32, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25 +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; 
GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX9-NEXT: v_mul_f32_e32 v25, v32, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_mul_f32_e32 v32, v32, v33 -; GFX9-NEXT: v_mul_f32_e32 v15, v15, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX9-NEXT: 
v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v34, 0x80000000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; 
GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 @@ -12394,7 +16563,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4 +; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v32bf16: @@ -12409,32 +16585,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_mul_f32_e32 v49, 
v50, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -12453,29 +16607,28 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53 ; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 @@ -12484,36 +16637,220 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65 ; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_mul_f32_e32 v29, v29, v36 -; GFX10-NEXT: v_mul_f32_e32 v28, v28, v38 -; GFX10-NEXT: v_mul_f32_e32 v27, v27, v48 -; GFX10-NEXT: v_mul_f32_e32 v26, v26, v50 +; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 +; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 +; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 +; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX10-NEXT: 
v_and_or_b32 v54, v39, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 +; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 +; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 +; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 +; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38 +; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 +; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 +; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 +; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 +; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302 +; GFX10-NEXT: v_mul_f32_e32 v20, v29, v36 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 +; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 
+; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 +; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 +; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 +; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff +; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 +; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 +; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff +; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 +; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 +; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 +; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 +; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 +; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 +; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; 
GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000 +; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 +; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 +; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 +; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 +; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 +; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 +; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 +; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 +; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff +; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 +; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000 +; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 +; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff +; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 +; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff +; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 +; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 +; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: 
v_add3_u32 v26, v26, v21, 0x7fff +; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 +; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 +; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 +; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 +; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 +; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff +; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 +; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff +; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 +; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 +; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 +; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff +; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 +; GFX10-NEXT: 
v_cndmask_b32_e64 v24, v24, v67, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 ; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 ; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 ; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 ; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GFX10-NEXT: v_mul_f32_e32 v16, v32, v16 -; GFX10-NEXT: v_mul_f32_e32 v15, v15, v17 -; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 +; GFX10-NEXT: v_mul_f32_e32 v17, v32, v17 +; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18 +; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 +; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v32bf16: @@ -12524,102 +16861,269 @@ define <32 x bfloat> 
@v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_mul_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11-NEXT: v_mul_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v67, 16, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 +; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 -; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 -; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_dual_mul_f32 v2, v2, v18 :: v_dual_mul_f32 v3, v3, v19 -; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26 -; GFX11-NEXT: 
v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51 -; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX11-NEXT: v_mul_f32_e32 v25, v54, v53 -; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_mul_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_mul_f32_e32 v18, v84, v83 +; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 +; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff +; GFX11-NEXT: v_and_b32_e32 
v20, 0xffff0000, v20 +; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_mul_f32_e32 v20, v80, v71 +; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_mul_f32_e32 v25, v54, v53 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX11-NEXT: v_mul_f32_e32 v23, v66, v65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29 -; GFX11-NEXT: v_dual_mul_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_and_b32 v30, 
0xffff0000, v30 -; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_mul_f32 v26, v52, v51 +; GFX11-NEXT: v_dual_mul_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_mul_f32_e32 v37, v86, v85 -; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30 ; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33 -; GFX11-NEXT: v_dual_mul_f32 v34, v80, v71 :: v_dual_mul_f32 v35, v82, v81 -; GFX11-NEXT: v_mul_f32_e32 v36, v84, v83 -; GFX11-NEXT: v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v15, v15, v17 -; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 +; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff +; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; 
GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff +; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 +; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff +; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff +; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 +; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 +; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff +; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff +; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 +; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; 
GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 -; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 +; 
GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 +; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 +; GFX11-NEXT: v_mul_f32_e32 v17, v32, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v15, v15, v18 +; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <32 x bfloat> %a, %b ret <32 x bfloat> %op @@ -12629,6 +17133,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fdiv_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; 
GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -12648,6 +17154,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fdiv_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -12680,6 +17188,13 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -12690,6 +17205,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0 ; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -12699,6 +17215,12 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -12708,16 
+17230,22 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -12726,6 +17254,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 @@ -12733,16 +17262,23 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-NEXT: 
v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fdiv bfloat %a, %b @@ -12755,16 +17291,18 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GCN-LABEL: v_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0| ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0| ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -12798,12 +17336,16 @@ define bfloat 
@v_fabs_bf16(bfloat %a) { define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fabs_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fabs_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fabs_bf16: @@ -12889,14 +17431,18 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fneg_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: s_xor_b32 s0, s0, 0x8000 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fneg_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fneg_bf16: @@ -12940,20 +17486,22 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GCN-LABEL: v_fneg_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0| ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fneg_fabs_bf16: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0| ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -12989,14 +17537,18 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fneg_fabs_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: s_bitset1_b32 s0, 15 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fneg_fabs_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: s_bitset1_b32 s0, 15 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fneg_fabs_bf16: @@ -13049,6 +17601,8 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_minnum_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -13060,6 +17614,8 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_minnum_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; 
GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -13074,6 +17630,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -13083,6 +17646,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -13091,7 +17661,13 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -13100,8 +17676,16 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) @@ -13112,6 +17696,10 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_minnum_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -13129,6 +17717,10 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -13148,10 +17740,24 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -13161,10 +17767,23 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13176,9 +17795,20 @@ define 
<2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v2bf16: @@ -13188,11 +17818,24 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, 
vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op @@ -13202,6 +17845,12 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_minnum_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -13225,6 +17874,12 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -13251,12 +17906,34 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 @@ -13268,12 +17945,31 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v3, v3 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 @@ -13282,16 +17978,32 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-LABEL: v_minnum_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 
s[30:31] %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) @@ -13302,6 +18014,14 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_minnum_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -13331,6 +18051,14 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -13362,17 +18090,46 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX8-NEXT: v_min_f32_e32 
v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 @@ -13383,16 +18140,41 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 @@ -13404,17 +18186,38 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; 
GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v4bf16: @@ -13422,19 +18225,45 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; 
GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op @@ -13444,6 +18273,22 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-LABEL: v_minnum_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -13497,6 +18342,22 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: 
v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -13552,31 +18413,88 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v7 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: 
v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v6 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v5 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v9, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 @@ -13589,28 +18507,77 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v9, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -13623,65 +18590,157 @@ define <8 x bfloat> 
@v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v9, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 +; GFX10-NEXT: v_min_f32_e32 v9, v11, v9 +; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX10-NEXT: v_min_f32_e32 v6, v11, v6 +; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 ; GFX10-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_min_f32_e32 v10, v11, v10 -; GFX10-NEXT: v_min_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX10-NEXT: v_min_f32_e32 v5, v15, v13 +; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000 +; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; 
GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v11, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_min_f32_e32 v9, v11, v9 +; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, 
v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000 ; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_dual_min_f32 v10, v11, v10 :: v_dual_min_f32 v11, v13, v12 +; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff +; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_min_f32_e32 v5, v15, v13 +; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op @@ -13691,56 +18750,87 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_minnum_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: 
v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_min_f32_e32 v14, v14, v30 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_min_f32_e32 v13, v13, v29 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_min_f32_e32 v12, v12, v28 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_min_f32_e32 v11, v11, v27 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_min_f32_e32 v10, v10, v26 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_min_f32_e32 v9, v9, v25 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_min_f32_e32 v8, v8, v24 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; 
GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_min_f32_e32 v7, v7, v23 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_min_f32_e32 v6, v6, v22 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_min_f32_e32 v5, v5, v21 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 @@ -13769,6 +18859,8 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_min_f32_e32 v2, v2, v18 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17 ; GCN-NEXT: v_min_f32_e32 v0, v0, v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -13783,8 +18875,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 
v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -13794,12 +18885,43 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: 
v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 @@ -13810,13 +18932,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 @@ -13839,17 +18961,21 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 @@ -13863,9 +18989,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 @@ -13879,10 +19006,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -13899,51 +19022,165 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v16 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX8-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v15 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX8-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v14 ; GFX8-NEXT: 
v_min_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v13 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v12 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, 
v12 +; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v11 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v10 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX8-NEXT: 
v_add_u32_e32 v9, vcc, v9, v1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v9 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -13951,7 +19188,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_min_f32_e32 v9, v17, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 @@ -13968,52 +19204,149 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; 
GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v9 ; GFX9-NEXT: 
v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 @@ -14032,119 +19365,297 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX10-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX10-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 +; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX10-NEXT: v_min_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff ; GFX10-NEXT: v_min_f32_e32 v6, v6, v14 -; 
GFX10-NEXT: v_min_f32_e32 v14, v19, v18 +; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 +; GFX10-NEXT: v_min_f32_e32 v17, v20, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX10-NEXT: v_min_f32_e32 v13, v21, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo +; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v13, v19, v18 +; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff +; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v12 
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo +; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v18, v20, v19 +; GFX10-NEXT: v_min_f32_e32 v12, v18, v12 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo +; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff +; GFX10-NEXT: v_min_f32_e32 v18, v19, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX10-NEXT: v_and_b32_e32 v9, 
0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000 +; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff +; GFX10-NEXT: v_min_f32_e32 v19, v22, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_min_f32_e32 v19, v20, v19 -; GFX10-NEXT: v_min_f32_e32 v20, v22, v21 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX10-NEXT: v_min_f32_e32 v9, v22, v20 +; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo +; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff +; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 -; 
GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX11-NEXT: v_min_f32_e32 v13, v21, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 
16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 -; GFX11-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 -; GFX11-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_min_f32_e32 v17, v18, v17 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 +; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff +; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff +; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11 -; GFX11-NEXT: v_min_f32_e32 v14, v19, v18 +; GFX11-NEXT: v_dual_min_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16 +; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18 +; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff +; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX11-NEXT: v_min_f32_e32 v12, v18, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff +; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v18, v19, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_min_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX11-NEXT: v_dual_min_f32 v18, 
v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff +; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff +; GFX11-NEXT: v_min_f32_e32 v19, v22, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v19, v20, v19 :: v_dual_min_f32 v20, v22, v21 -; GFX11-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1 +; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v1, v1, v9 +; GFX11-NEXT: 
v_min_f32_e32 v9, v22, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff +; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo +; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op @@ -14154,230 +19665,294 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-LABEL: v_minnum_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_min_f32_e32 v31, v32, v31 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_min_f32_e32 v31, v31, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v30, v30, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v29, v29, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: 
v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v28, v28, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v27, v27, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v26, v26, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v25, v25, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, 
v24 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v23, v23, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v22, v22, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v21, v21, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 
0xffff0000, v20 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v20, v20, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v19, v19, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v18, v18, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v17, v17, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: 
v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v16, v16, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v15, v15, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v14, v14, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v13, v13, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; 
GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v12, v12, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v11, v11, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v10, v10, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v9, v9, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; 
GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v8, v8, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v7, v7, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v6, v6, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v5, v5, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 
v4, 0xffff0000, v4 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v4, v4, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v3, v3, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v2, v2, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v1, v1, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; 
GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -14417,260 +19992,324 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; 
GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; 
GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 
v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 
0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v32 ; GFX7-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v32 @@ -14682,114 +20321,329 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX8-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_min_f32_e32 v30, v14, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX8-NEXT: v_min_f32_e32 v14, v14, 
v30 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 +; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 +; GFX8-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX8-NEXT: v_min_f32_e32 v32, v32, v30 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_min_f32_e32 v14, v32, v14 ; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX8-NEXT: v_min_f32_e32 v33, v33, v34 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_min_f32_e32 v30, v15, v30 +; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v33 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v30 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, 
v32 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 +; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GFX8-NEXT: v_min_f32_e32 v29, v33, v29 +; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_min_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v29 ; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 +; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX8-NEXT: v_min_f32_e32 v28, v33, v28 +; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_min_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 
v34, 0x80000000, v28 ; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX8-NEXT: v_min_f32_e32 v27, v33, v27 +; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_min_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v27 ; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX8-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_min_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 
v34, 0x80000000, v26 ; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 +; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX8-NEXT: v_min_f32_e32 v25, v33, v25 +; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v25 ; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX8-NEXT: v_min_f32_e32 v25, v32, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX8-NEXT: v_min_f32_e32 v32, v32, v33 -; GFX8-NEXT: v_min_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX8-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v24 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v23 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX8-NEXT: 
v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 +; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX8-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v22 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 +; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v21 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 +; GFX8-NEXT: v_add_u32_e32 
v20, vcc, s4, v20 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX8-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v20 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v19 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX8-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v18 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v17 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -14798,8 +20652,13 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -14808,8 +20667,13 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v32bf16: @@ -14817,110 +20681,296 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 
0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 +; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v30 +; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_min_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_min_f32_e32 v29, v15, v29 +; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v32 +; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v29 +; 
GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v32 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_min_f32_e32 v29, v32, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_min_f32_e32 v28, v33, v28 +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_min_f32_e32 v28, v32, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v33, 0x80000000, v11 +; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_min_f32_e32 v27, v33, v27 +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_min_f32_e32 v27, v32, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_min_f32_e32 v26, v32, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, 
v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_min_f32_e32 v25, v33, v25 +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX9-NEXT: v_min_f32_e32 v25, v32, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_min_f32_e32 v32, v32, v33 -; GFX9-NEXT: v_min_f32_e32 v15, v15, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 +; 
GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 +; 
GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; 
GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, 
v17 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 @@ -14929,7 +20979,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4 +; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v32bf16: @@ -14944,32 +21001,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX10-NEXT: v_and_b32_e32 v12, 
0xffff0000, v12 -; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -14988,6 +21023,68 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 ; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 +; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 
0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_min_f32_e32 v25, v54, v53 +; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_min_f32_e32 v24, v64, v55 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_min_f32_e32 v23, v66, v65 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_min_f32_e32 v22, v68, v67 +; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 +; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 +; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 +; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_min_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_min_f32_e32 v17, v26, v50 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 +; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 +; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 +; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 +; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_min_f32_e32 v33, v34, v33 @@ 
-14996,59 +21093,180 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 -; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_min_f32_e32 v19, v28, v38 +; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 +; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 +; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 +; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 +; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 ; GFX10-NEXT: v_min_f32_e32 v51, v52, v51 -; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_min_f32_e32 v25, v54, v53 -; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_min_f32_e32 v24, v64, v55 -; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_min_f32_e32 v23, v66, v65 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_min_f32_e32 v22, v68, v67 ; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX10-NEXT: v_min_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_min_f32_e32 v29, v29, v36 -; GFX10-NEXT: v_min_f32_e32 v28, v28, v38 -; GFX10-NEXT: v_min_f32_e32 v27, v27, v48 -; 
GFX10-NEXT: v_min_f32_e32 v26, v26, v50 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302 +; GFX10-NEXT: v_min_f32_e32 v20, v29, v36 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 +; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 +; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 +; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 +; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 +; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff +; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 +; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 +; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff +; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 +; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 +; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 +; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 +; 
GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 +; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 +; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000 +; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 +; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 +; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 +; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 +; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 +; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 +; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 +; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 +; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff +; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 +; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000 +; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 +; GFX10-NEXT: v_add3_u32 v9, v35, v9, 
0x7fff +; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 +; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff +; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 +; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 +; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff +; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 +; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 +; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 +; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 +; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 +; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff +; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 +; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff +; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 +; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 +; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 +; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff +; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000 +; 
GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 ; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 ; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 ; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 ; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GFX10-NEXT: v_min_f32_e32 v16, v32, v16 -; GFX10-NEXT: v_min_f32_e32 v15, v15, v17 -; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 +; GFX10-NEXT: v_min_f32_e32 v17, v32, v17 +; GFX10-NEXT: v_min_f32_e32 v15, v15, v18 +; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000 +; GFX10-NEXT: 
v_and_or_b32 v21, v15, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 +; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v32bf16: @@ -15059,102 +21277,269 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 +; GFX11-NEXT: v_dual_min_f32 
v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11-NEXT: v_min_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 +; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 -; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 -; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_dual_min_f32 v2, v2, v18 :: v_dual_min_f32 v3, v3, v19 -; GFX11-NEXT: v_dual_min_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26 -; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX11-NEXT: v_min_f32_e32 v26, v52, v51 -; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX11-NEXT: v_min_f32_e32 v25, v54, v53 -; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_min_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_min_f32_e32 v18, v84, v83 +; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 +; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11-NEXT: 
v_bfe_u32 v113, v21, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX11-NEXT: v_dual_min_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_min_f32_e32 v20, v80, v71 +; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_min_f32_e32 v25, v54, v53 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX11-NEXT: v_min_f32_e32 v23, v66, v65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29 -; GFX11-NEXT: v_dual_min_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_min_f32 
v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_min_f32_e32 v28, v48, v39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_min_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_min_f32 v26, v52, v51 +; GFX11-NEXT: v_dual_min_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_min_f32_e32 v37, v86, v85 -; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33 -; GFX11-NEXT: v_dual_min_f32 v34, v80, v71 :: v_dual_min_f32 v35, v82, v81 -; GFX11-NEXT: v_min_f32_e32 v36, v84, v83 -; GFX11-NEXT: v_dual_min_f32 v16, v32, v16 :: v_dual_min_f32 v15, v15, v17 -; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 +; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff +; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff +; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff +; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 +; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff +; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff +; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 +; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000 +; GFX11-NEXT: 
v_bfe_u32 v81, v25, 16, 1 +; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff +; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff +; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 +; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: 
v_add3_u32 v131, v131, v2, 0x7fff +; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 -; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 +; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 +; GFX11-NEXT: v_min_f32_e32 v17, v32, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v15, v15, v18 +; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, 
vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) ret <32 x bfloat> %op @@ -15173,6 +21558,8 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_maxnum_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -15184,6 +21571,8 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_maxnum_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -15198,6 +21587,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -15207,6 +21603,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 
0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -15215,7 +21618,13 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -15224,8 +21633,16 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) @@ -15236,6 +21653,10 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 
1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -15253,6 +21674,10 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -15272,10 +21697,24 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -15285,10 +21724,23 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -15300,9 +21752,20 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v2bf16: @@ -15312,11 +21775,24 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x 
bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op @@ -15326,6 +21802,12 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -15349,6 +21831,12 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -15375,12 +21863,34 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 
v0, v0, v3, 16 @@ -15392,12 +21902,31 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 @@ -15406,16 +21935,32 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-LABEL: v_maxnum_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: 
v_max_f32_e32 v4, v5, v4 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) @@ -15426,6 +21971,14 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -15455,6 +22008,14 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, 
v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -15486,17 +22047,46 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; 
GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 @@ -15507,16 +22097,41 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 @@ -15528,17 +22143,38 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; 
GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v4bf16: @@ -15546,19 +22182,45 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v2, v1, 
16, 1 +; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff +; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op @@ -15568,6 +22230,22 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: 
v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -15621,6 +22299,22 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -15676,31 +22370,88 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; 
GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX8-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v7 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v6 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX8-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v5 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v9, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 @@ -15713,28 +22464,77 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 
v1, v5, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -15747,65 +22547,157 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v9, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 +; GFX10-NEXT: v_max_f32_e32 v9, v11, v9 +; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_add3_u32 v10, 
v10, v8, 0x7fff +; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; GFX10-NEXT: v_max_f32_e32 v6, v11, v6 +; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v10, v11, v10 -; GFX10-NEXT: v_max_f32_e32 v11, v13, v12 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 +; GFX10-NEXT: v_max_f32_e32 v5, v15, v13 +; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v12, v5, 
16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000 +; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v11, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_brev_b32 s0, 1 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_max_f32_e32 v9, v11, v9 +; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; 
GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_dual_max_f32 v10, v11, v10 :: v_dual_max_f32 v11, v13, v12 +; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff +; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v9, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_max_f32_e32 v5, v15, v13 +; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 
v12, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op @@ -15815,56 +22707,87 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_max_f32_e32 v14, v14, v30 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_max_f32_e32 v13, v13, v29 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_max_f32_e32 v12, v12, v28 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: 
v_max_f32_e32 v11, v11, v27 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_max_f32_e32 v10, v10, v26 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_max_f32_e32 v9, v9, v25 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_max_f32_e32 v8, v8, v24 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_max_f32_e32 v7, v7, v23 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_max_f32_e32 v6, v6, v22 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_max_f32_e32 v5, v5, v21 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, 
v18 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 @@ -15893,6 +22816,8 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_max_f32_e32 v2, v2, v18 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17 ; GCN-NEXT: v_max_f32_e32 v0, v0, v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -15907,8 +22832,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -15918,12 +22842,43 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; 
GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 @@ -15934,13 +22889,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 
v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 @@ -15963,17 +22918,21 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 @@ -15987,9 +22946,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 @@ -16003,10 +22963,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -16023,51 +22979,165 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v16 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX8-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 
0x80000000, v15 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX8-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v14 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX8-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v13 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX8-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v12 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX8-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v11 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 
v11, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v10 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX8-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v9 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX8-NEXT: 
v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -16075,7 +23145,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_max_f32_e32 v9, v17, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 @@ -16092,52 +23161,149 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX9-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; 
GFX9-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX9-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 +; 
GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 
+; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 @@ -16156,119 +23322,297 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v19, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX10-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX10-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 +; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX10-NEXT: v_max_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff ; GFX10-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_max_f32_e32 v14, v19, v18 +; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 +; GFX10-NEXT: v_max_f32_e32 v17, v20, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX10-NEXT: v_max_f32_e32 v13, v21, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; 
GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo +; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v13, v19, v18 +; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff +; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo +; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v18, v20, v19 +; GFX10-NEXT: v_max_f32_e32 v12, v18, v12 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo +; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v10, 
0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff +; GFX10-NEXT: v_max_f32_e32 v18, v19, v18 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000 +; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff +; GFX10-NEXT: v_max_f32_e32 v19, v22, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v19, v20, v19 -; GFX10-NEXT: v_max_f32_e32 v20, v22, v21 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX10-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; 
GFX10-NEXT: v_max_f32_e32 v9, v22, v20 +; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff +; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo +; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff +; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff ; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, 
v13 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX11-NEXT: v_max_f32_e32 v13, v21, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 -; GFX11-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 -; GFX11-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 
+; GFX11-NEXT: v_max_f32_e32 v17, v18, v17 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 +; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff +; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff +; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11 -; GFX11-NEXT: v_max_f32_e32 v14, v19, v18 +; GFX11-NEXT: v_dual_max_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16 +; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18 +; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff +; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_max_f32_e32 v12, v18, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff +; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 +; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v18, v19, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_perm_b32 v5, v5, v14, 0x7060302 -; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX11-NEXT: v_dual_max_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff +; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff +; GFX11-NEXT: v_max_f32_e32 v19, v22, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 
v22, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v19, v20, v19 :: v_dual_max_f32 v20, v22, v21 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v1, v1, v9 +; GFX11-NEXT: v_max_f32_e32 v9, v22, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff +; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo +; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: 
v_perm_b32 v1, v1, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op @@ -16278,230 +23622,294 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-LABEL: v_maxnum_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_max_f32_e32 v31, v32, v31 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_max_f32_e32 v31, v31, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v30, v30, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v29, v29, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v28, v28, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v27, v27, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_mul_f32_e32 
v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v26, v26, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v25, v25, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v23, v23, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: 
v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v22, v22, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v21, v21, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v20, v20, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v19, v19, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; 
GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v18, v18, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v17, v17, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v16, v16, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v15, v15, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 
0xffff0000, v14 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v14, v14, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v13, v13, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v12, v12, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v11, v11, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: 
v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v10, v10, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v9, v9, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v8, v8, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v7, v7, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 
0xffff0000, v6 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v6, v6, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v5, v5, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v4, v4, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v3, v3, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v2, v2, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v1, v1, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -16541,260 +23949,324 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: 
v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; 
GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; 
GFX7-NEXT: v_max_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 
GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 
v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: 
v_max_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 
; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v32 @@ -16806,114 +24278,329 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX8-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX8-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_max_f32_e32 v30, v14, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX8-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 +; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 +; GFX8-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX8-NEXT: v_max_f32_e32 v32, v32, v30 +; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX8-NEXT: v_max_f32_e32 v14, v32, v14 ; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: 
v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX8-NEXT: v_max_f32_e32 v33, v33, v34 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_max_f32_e32 v30, v15, v30 +; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v33 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v30 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 +; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GFX8-NEXT: v_max_f32_e32 v29, v33, v29 +; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX8-NEXT: v_max_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v29 ; 
GFX8-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 +; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX8-NEXT: v_max_f32_e32 v28, v33, v28 +; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX8-NEXT: v_max_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v28 ; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX8-NEXT: v_max_f32_e32 v27, v33, v27 +; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX8-NEXT: v_max_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v27 
; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX8-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX8-NEXT: v_max_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v26 ; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 +; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX8-NEXT: v_max_f32_e32 v25, v33, v25 +; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v25 ; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX8-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 -; GFX8-NEXT: v_max_f32_e32 v25, v32, v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX8-NEXT: v_max_f32_e32 v32, v32, v33 -; GFX8-NEXT: v_max_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX8-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v24 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_bfe_u32 v23, v7, 
16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v23 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 +; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX8-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX8-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v22 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 +; 
GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v21 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX8-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v20 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX8-NEXT: 
v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v19 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX8-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v18 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 
v17, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v17 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -16922,8 +24609,13 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 @@ -16932,8 +24624,13 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 ; 
GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v14, v16, v31, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v32bf16: @@ -16941,110 +24638,296 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; 
GFX9-NEXT: v_max_f32_e32 v30, v32, v30 +; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v30 +; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_max_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_max_f32_e32 v29, v15, v29 +; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v32 +; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v29 +; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v32 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_max_f32_e32 v29, v32, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX9-NEXT: 
v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_max_f32_e32 v28, v33, v28 +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_max_f32_e32 v28, v32, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_max_f32_e32 v27, v33, v27 +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_max_f32_e32 v27, v32, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX9-NEXT: v_or_b32_e32 
v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_max_f32_e32 v26, v32, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_max_f32_e32 v25, v33, v25 +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX9-NEXT: v_max_f32_e32 v25, v32, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX9-NEXT: v_perm_b32 
v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_max_f32_e32 v32, v32, v33 -; GFX9-NEXT: v_max_f32_e32 v15, v15, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v23, 
v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 
v20, 16, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_max_f32_e32 
v18, v33, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 ; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 @@ -17053,7 +24936,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 ; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 ; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s4 +; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX9-NEXT: 
v_perm_b32 v13, v13, v30, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v32bf16: @@ -17068,32 +24958,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 -; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 @@ -17112,29 +24980,28 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v68, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v33, v34, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18 ; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_max_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX10-NEXT: v_max_f32_e32 v25, v54, v53 ; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 @@ -17143,36 +25010,220 @@ define <32 x bfloat> 
@v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_max_f32_e32 v23, v66, v65 ; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX10-NEXT: v_max_f32_e32 v22, v68, v67 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_max_f32_e32 v21, v30, v34 -; GFX10-NEXT: v_max_f32_e32 v29, v29, v36 -; GFX10-NEXT: v_max_f32_e32 v28, v28, v38 -; GFX10-NEXT: v_max_f32_e32 v27, v27, v48 -; GFX10-NEXT: v_max_f32_e32 v26, v26, v50 +; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1 +; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 +; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 +; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_max_f32_e32 v18, v27, v48 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_max_f32_e32 v17, v26, v50 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 +; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 +; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49 +; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10 +; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: 
v_max_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_max_f32_e32 v19, v28, v38 +; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9 +; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10 +; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11 +; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12 +; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_max_f32_e32 v21, v30, v34 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX10-NEXT: v_perm_b32 v1, v1, v27, 0x7060302 -; GFX10-NEXT: v_perm_b32 v0, v0, v26, 0x7060302 -; GFX10-NEXT: v_perm_b32 v2, v2, v28, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v3, v29, 0x7060302 +; GFX10-NEXT: v_max_f32_e32 v20, v29, v36 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 +; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 +; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 +; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 +; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 +; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff +; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 +; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 +; GFX10-NEXT: 
v_add3_u32 v66, v66, v17, 0x7fff +; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 +; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14 +; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35 +; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13 +; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7 +; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8 +; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000 +; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 +; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 +; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 +; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff +; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24 +; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo +; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4 +; GFX10-NEXT: v_bfe_u32 v28, 
v4, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5 +; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6 +; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 +; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff +; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 +; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000 +; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000 +; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 +; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff +; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 +; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff +; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 +; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 +; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff +; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 +; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff +; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 +; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 +; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff +; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 +; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 +; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000 +; GFX10-NEXT: 
v_cmp_u_f32_e64 s16, v8, v8 +; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff +; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 +; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff +; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff +; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7 +; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 +; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 +; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff +; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21 ; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 ; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302 ; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302 ; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302 -; 
GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x7060302 -; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 +; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GFX10-NEXT: v_max_f32_e32 v16, v32, v16 -; GFX10-NEXT: v_max_f32_e32 v15, v15, v17 -; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 +; GFX10-NEXT: v_max_f32_e32 v17, v32, v17 +; GFX10-NEXT: v_max_f32_e32 v15, v15, v18 +; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000 +; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 +; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff +; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4 +; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v32bf16: @@ -17183,102 +25234,269 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 ; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 ; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 ; GFX11-NEXT: v_and_b32_e32 
v16, 0xffff0000, v16 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 +; GFX11-NEXT: v_max_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11 ; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 +; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 +; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 +; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 -; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 -; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v18 :: v_dual_max_f32 v3, v3, v19 -; GFX11-NEXT: v_dual_max_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26 -; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX11-NEXT: v_max_f32_e32 v26, v52, v51 -; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 -; GFX11-NEXT: v_max_f32_e32 v25, v54, v53 -; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 -; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v31 -; 
GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_max_f32_e32 v24, v64, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_max_f32_e32 v18, v84, v83 +; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 +; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000 +; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; GFX11-NEXT: 
v_dual_max_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_max_f32_e32 v20, v80, v71 +; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_max_f32_e32 v25, v54, v53 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX11-NEXT: v_max_f32_e32 v23, v66, v65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29 -; GFX11-NEXT: v_dual_max_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_max_f32_e32 v28, v48, v39 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_max_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_max_f32 v26, v52, v51 +; GFX11-NEXT: v_dual_max_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_max_f32_e32 v37, v86, v85 -; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33 -; GFX11-NEXT: v_dual_max_f32 
v34, v80, v71 :: v_dual_max_f32 v35, v82, v81 -; GFX11-NEXT: v_max_f32_e32 v36, v84, v83 -; GFX11-NEXT: v_dual_max_f32 v16, v32, v16 :: v_dual_max_f32 v15, v15, v17 -; GFX11-NEXT: v_perm_b32 v0, v0, v37, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v2, v2, v35, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v36, 0x7060302 -; GFX11-NEXT: v_perm_b32 v3, v3, v34, 0x7060302 -; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 -; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 +; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff +; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff +; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 +; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff +; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff +; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 +; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 +; GFX11-NEXT: v_add3_u32 
v55, v55, v11, 0x7fff +; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 +; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff +; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff +; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 +; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 +; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff +; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff +; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 +; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11-NEXT: v_add3_u32 v129, 
v129, v19, 0x7fff +; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000 +; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff ; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 ; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 -; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: 
v_cndmask_b32_e32 v6, v99, v100, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 +; GFX11-NEXT: v_perm_b32 v2, 
v2, v19, 0x7060302 +; GFX11-NEXT: v_max_f32_e32 v17, v32, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v15, v15, v18 +; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) ret <32 x bfloat> %op @@ -17290,9 +25508,10 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GCN-LABEL: v_sqrt_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0xf800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x260 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -17315,6 +25534,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX7-LABEL: v_sqrt_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0xf800000 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 @@ -17359,6 +25579,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX8-NEXT: 
v_cmp_class_f32_e32 vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -17384,6 +25611,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17403,10 +25637,16 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4 ; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -17430,14 +25670,21 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 ; GFX11-NEXT: 
v_cmp_lt_f32_e64 s0, 0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.sqrt.bf16(bfloat %a) @@ -17450,6 +25697,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GCN-LABEL: v_ldexp_bf16_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -17458,6 +25706,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX7-LABEL: v_ldexp_bf16_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -17468,6 +25717,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -17476,6 +25732,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17483,7 +25746,13 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -17491,8 +25760,16 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: 
v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b) @@ -17505,12 +25782,14 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GCN-LABEL: v_frexp_bf16_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x7f800000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_frexp_mant_f32_e32 v1, v0 ; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 ; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -17518,6 +25797,7 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX7-LABEL: v_frexp_bf16_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 ; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 @@ -17527,10 +25807,17 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX8-LABEL: v_frexp_bf16_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 -; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 
0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_frexp_bf16_i16: @@ -17538,6 +25825,13 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17546,8 +25840,14 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v3, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a) @@ -17563,11 +25863,12 @@ define bfloat @v_log_bf16(bfloat %a) { ; GCN-LABEL: v_log_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GCN-NEXT: s_mov_b32 
s5, 0x7f800000 ; GCN-NEXT: v_mov_b32_e32 v2, 0x41b17218 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -17591,6 +25892,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX7-LABEL: v_log_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000 @@ -17638,6 +25940,13 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -17663,6 +25972,13 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17670,18 +25986,24 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 
0x4f800000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x7f800000, |v0| ; GFX10-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 ; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo +; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -17689,24 +26011,31 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| ; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX11-NEXT: 
v_add_f32_e32 v1, v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo +; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log.bf16(bfloat %a) @@ -17717,10 +26046,11 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GCN-LABEL: v_log2_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GCN-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -17733,6 +26063,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX7-LABEL: v_log2_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000 @@ -17759,6 
+26090,13 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -17774,7 +26112,14 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17782,12 +26127,18 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -17795,6 +26146,7 @@ 
define bfloat @v_log2_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo @@ -17804,6 +26156,13 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log2.bf16(bfloat %a) @@ -17814,11 +26173,12 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GCN-LABEL: v_log10_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000 ; GCN-NEXT: v_mov_b32_e32 v2, 0x411a209b +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -17842,6 +26202,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX7-LABEL: v_log10_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000 @@ -17889,6 +26250,13 @@ define bfloat 
@v_log10_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0x411a209b ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -17914,6 +26282,13 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17921,18 +26296,24 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x7f800000, |v0| ; GFX10-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 ; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo +; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 
0x7f800000, |v0| +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -17940,24 +26321,31 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| ; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo +; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: 
v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log10.bf16(bfloat %a) @@ -17972,10 +26360,11 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GCN-LABEL: v_exp_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0xc2ce8ed0 ; GCN-NEXT: s_mov_b32 s5, 0x42b17218 ; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v0 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 @@ -17999,6 +26388,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX7-LABEL: v_exp_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x3fb8aa3b ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 @@ -18045,6 +26435,13 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; 
@@ -18070,6 +26467,13 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18077,6 +26481,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1 @@ -18090,6 +26495,11 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18097,6 +26507,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1 @@ -18115,7 +26526,13 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 
0x42b17218, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.exp.bf16(bfloat %a) @@ -18126,10 +26543,11 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GCN-LABEL: v_exp2_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GCN-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 @@ -18142,6 +26560,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX7-LABEL: v_exp2_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0xc2fc0000 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42800000 @@ -18168,6 +26587,13 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 
v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18183,7 +26609,14 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18191,12 +26624,18 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18204,6 +26643,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo @@ -18213,6 +26653,13 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11-NEXT: 
v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.exp2.bf16(bfloat %a) @@ -18223,10 +26670,11 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GCN-LABEL: v_exp10_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0xc23369f4 ; GCN-NEXT: s_mov_b32 s5, 0x421a209b ; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x40549000, v0 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 @@ -18250,6 +26698,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX7-LABEL: v_exp10_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x40549a78 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 @@ -18296,6 +26745,13 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, 
v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18321,6 +26777,13 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18328,6 +26791,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1 @@ -18341,6 +26805,11 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18348,6 +26817,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1 @@ -18366,7 +26836,13 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo 
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.exp10.bf16(bfloat %a) @@ -18379,6 +26855,7 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GCN-LABEL: v_ceil_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_ceil_f32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18387,6 +26864,7 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX7-LABEL: v_ceil_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_ceil_f32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18397,6 +26875,13 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_ceil_f32_e32 v0, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18405,6 +26890,13 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_ceil_f32_e32 v0, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18412,7 +26904,13 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_ceil_f32_e32 v0, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18420,8 +26918,16 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ceil_f32_e32 v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.ceil.bf16(bfloat %a) @@ -18434,6 +26940,7 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GCN-LABEL: v_trunc_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_trunc_f32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18442,6 +26949,7 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX7-LABEL: v_trunc_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18452,6 +26960,13 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_trunc_f32_e32 v0, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18460,6 +26975,13 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_trunc_f32_e32 v0, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18467,7 +26989,13 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: 
v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18475,8 +27003,16 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.trunc.bf16(bfloat %a) @@ -18489,6 +27025,7 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GCN-LABEL: v_rint_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_rndne_f32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18497,6 +27034,7 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX7-LABEL: v_rint_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18507,6 +27045,13 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, 
vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18515,6 +27060,13 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18522,7 +27074,13 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18530,8 +27088,16 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.rint.bf16(bfloat %a) @@ -18544,6 +27110,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GCN-LABEL: v_nearbyint_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_rndne_f32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18552,6 +27119,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX7-LABEL: v_nearbyint_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18562,6 +27130,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18570,6 +27145,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18577,7 +27159,13 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18585,8 +27173,16 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.nearbyint.bf16(bfloat %a) @@ -18599,6 +27195,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GCN-LABEL: v_round_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v0 ; GCN-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -18613,6 +27210,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX7-LABEL: v_round_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_trunc_f32_e32 v1, v0 ; GFX7-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -18635,6 +27233,13 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18649,6 +27254,13 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18660,8 +27272,14 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18675,10 +27293,17 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.round.bf16(bfloat %a) @@ -18691,6 +27316,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GCN-LABEL: v_roundeven_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_rndne_f32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18699,6 +27325,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX7-LABEL: v_roundeven_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18709,6 +27336,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_rndne_f32_e32 v0, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: 
v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18717,6 +27351,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18724,7 +27365,13 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18732,8 +27379,16 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.roundeven.bf16(bfloat %a) @@ -18746,6 +27401,7 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GCN-LABEL: v_floor_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_floor_f32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18754,6 +27410,7 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX7-LABEL: v_floor_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_floor_f32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -18764,6 +27421,13 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_floor_f32_e32 v0, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18772,6 +27436,13 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_floor_f32_e32 v0, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18779,7 
+27450,13 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_floor_f32_e32 v0, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18787,8 +27464,16 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_floor_f32_e32 v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.floor.bf16(bfloat %a) @@ -18813,6 +27498,13 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -18821,6 +27513,13 @@ define 
bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -18828,7 +27527,13 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -18836,8 +27541,16 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.canonicalize.bf16(bfloat %a) @@ -18896,6 +27609,8 @@ define i1 
@v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_oeq_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 @@ -18905,6 +27620,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_oeq_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 @@ -18955,6 +27672,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ogt_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 @@ -18964,6 +27683,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ogt_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 @@ -19014,6 +27735,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_oge_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 @@ -19023,6 +27746,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_oge_bf16: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 @@ -19073,6 +27798,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_olt_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 @@ -19082,6 +27809,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_olt_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 @@ -19132,6 +27861,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ole_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 @@ -19141,6 +27872,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ole_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 @@ -19191,6 +27924,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_one_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 @@ -19200,6 +27935,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_one_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 @@ -19250,6 +27987,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_uno_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 @@ -19259,6 +27998,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_uno_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 @@ -19309,6 +28050,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ueq_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 @@ -19318,6 +28061,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ueq_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 
1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 @@ -19368,6 +28113,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ugt_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 @@ -19377,6 +28124,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ugt_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 @@ -19427,6 +28176,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_uge_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 @@ -19436,6 +28187,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_uge_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 @@ -19486,6 +28239,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ult_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 @@ -19495,6 +28250,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ult_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 @@ -19545,6 +28302,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ule_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 @@ -19554,6 +28313,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ule_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 @@ -19604,6 +28365,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_une_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 @@ -19613,6 +28376,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_une_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: 
v_cmp_neq_f32_e32 vcc, v0, v1 @@ -19705,6 +28470,7 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { ; GCN-LABEL: v_copysign_bf16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -19715,6 +28481,7 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { ; GFX7-LABEL: v_copysign_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -19762,6 +28529,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GCN-LABEL: v_copysign_bf16_s_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_and_b32 s4, s4, 0x80000000 ; GCN-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -19772,6 +28540,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX7-LABEL: v_copysign_bf16_s_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_and_b32 s4, s4, 0x80000000 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -19821,20 +28590,22 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GCN-LABEL: v_copysign_s_bf16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_bfe_u32 s4, s4, 0xf0010 -; GCN-NEXT: v_or_b32_e32 v0, s4, v0 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_s_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_bfe_u32 s4, s4, 0xf0010 -; GFX7-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -19880,6 +28651,7 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { ; GCN-LABEL: v_copysign_bf16_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -19890,6 +28662,7 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { ; GFX7-LABEL: v_copysign_bf16_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -19939,6 +28712,7 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { ; GCN-LABEL: v_copysign_bf16_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -19949,6 +28723,7 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { ; GFX7-LABEL: v_copysign_bf16_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -19998,6 +28773,7 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, 
half %sign.f16) { ; GCN-LABEL: v_copysign_bf16_f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -20009,6 +28785,7 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -20055,18 +28832,22 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) { define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) { ; GCN-LABEL: s_copysign_bf16_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s1, s1, 0x80000000 -; GCN-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_or_b32_e32 v0, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_and_b32 s1, s1, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_bf16: @@ -20120,18 +28901,22 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) { 
; GCN-LABEL: s_copysign_bf16_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s1, s1, 0x80000000 -; GCN-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_or_b32_e32 v0, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_and_b32 s1, s1, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_f32: @@ -20189,18 +28974,22 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) { ; GCN-LABEL: s_copysign_bf16_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s1, s2, 0x80000000 -; GCN-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: s_and_b32 s0, s2, 0x80000000 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_or_b32_e32 v0, s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_and_b32 s1, s2, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; 
GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_f64: @@ -20258,19 +29047,21 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) { ; GCN-LABEL: s_copysign_bf16_f16: ; GCN: ; %bb.0: -; GCN-NEXT: v_cvt_f16_f32_e32 v0, s1 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, s1 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s1 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; @@ -20439,8 +29230,9 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GCN-LABEL: v_copysign_f16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GCN-NEXT: s_brev_b32 s4, -2 @@ -20451,10 +29243,11 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: s_brev_b32 s4, -2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_brev_b32 s4, -2 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -20491,12 +29284,13 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) { ; GCN-LABEL: s_copysign_f16_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, s0 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, s0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s1 ; GCN-NEXT: s_brev_b32 s0, -2 -; GCN-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GCN-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog @@ -20504,10 +29298,11 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; GFX7-LABEL: s_copysign_f16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX7-NEXT: s_brev_b32 s0, -2 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_brev_b32 s0, -2 ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -20677,6 +29472,7 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) { ; GCN-LABEL: v_fptosi_bf16_to_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -20684,6 +29480,7 @@ define i16 
@v_fptosi_bf16_to_i16(bfloat %x) { ; GFX7-LABEL: v_fptosi_bf16_to_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -20724,6 +29521,8 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v2bf16_to_v2i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -20737,6 +29536,8 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -20797,6 +29598,9 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v3bf16_to_v3i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -20813,6 +29617,9 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ 
-20884,6 +29691,10 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v4bf16_to_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -20905,6 +29716,10 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -20994,6 +29809,7 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) { ; GCN-LABEL: v_fptosi_bf16_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -21001,6 +29817,7 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) { ; GFX7-LABEL: v_fptosi_bf16_to_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21041,6 +29858,8 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v2bf16_to_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 @@ -21050,6 +29869,8 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 @@ -21102,6 +29923,9 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v3bf16_to_v3i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -21113,6 +29937,9 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -21178,6 +30005,10 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v4bf16_to_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -21191,6 +30022,10 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -21266,9 +30101,10 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GCN-LABEL: v_fptosi_bf16_to_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_trunc_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 @@ -21285,6 +30121,7 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX7-LABEL: v_fptosi_bf16_to_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 @@ -21385,9 +30222,11 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v2bf16_to_v2i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_trunc_f32_e32 v0, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -21416,6 +30255,7 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v2bf16_to_v2i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; 
GFX7-NEXT: v_trunc_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 @@ -21424,13 +30264,14 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 ; GFX7-NEXT: v_fma_f32 v3, v2, s5, |v0| ; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX7-NEXT: v_xor_b32_e32 v0, v3, v4 ; GFX7-NEXT: v_trunc_f32_e32 v3, v1 ; GFX7-NEXT: v_mul_f32_e64 v1, |v3|, s4 ; GFX7-NEXT: v_floor_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX7-NEXT: v_fma_f32 v5, v1, s5, |v3| ; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v1 @@ -21578,9 +30419,12 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v3bf16_to_v3i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_trunc_f32_e32 v0, v0 @@ -21621,6 +30465,7 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v3bf16_to_v3i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 @@ -21629,15 +30474,17 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 ; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v0| ; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; 
GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX7-NEXT: v_xor_b32_e32 v0, v4, v5 ; GFX7-NEXT: v_trunc_f32_e32 v4, v1 ; GFX7-NEXT: v_mul_f32_e64 v1, |v4|, s4 +; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX7-NEXT: v_floor_f32_e32 v1, v1 ; GFX7-NEXT: v_fma_f32 v6, v1, s5, |v4| ; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_xor_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v1 @@ -21843,9 +30690,13 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GCN-LABEL: v_fptosi_v4bf16_to_v4i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -21898,57 +30749,60 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GFX7-LABEL: v_fptosi_v4bf16_to_v4i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX7-NEXT: v_mul_f32_e64 v4, |v0|, s4 -; GFX7-NEXT: v_floor_f32_e32 v4, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4 +; GFX7-NEXT: v_floor_f32_e32 v3, v3 ; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX7-NEXT: v_fma_f32 v5, v4, s5, |v0| +; GFX7-NEXT: v_fma_f32 v5, v3, s5, |v0| ; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 
-; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX7-NEXT: v_xor_b32_e32 v0, v5, v6 ; GFX7-NEXT: v_trunc_f32_e32 v5, v1 ; GFX7-NEXT: v_mul_f32_e64 v1, |v5|, s4 +; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX7-NEXT: v_floor_f32_e32 v1, v1 ; GFX7-NEXT: v_fma_f32 v7, v1, s5, |v5| ; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX7-NEXT: v_xor_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_xor_b32_e32 v3, v3, v6 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v1 -; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v4, v6, vcc -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v5 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_xor_b32_e32 v6, v7, v4 +; GFX7-NEXT: v_xor_b32_e32 v6, v7, v3 ; GFX7-NEXT: v_trunc_f32_e32 v7, v2 ; GFX7-NEXT: v_mul_f32_e64 v2, |v7|, s4 ; GFX7-NEXT: v_floor_f32_e32 v2, v2 -; GFX7-NEXT: v_xor_b32_e32 v5, v8, v4 +; GFX7-NEXT: v_xor_b32_e32 v5, v8, v3 ; GFX7-NEXT: v_fma_f32 v8, v2, s5, |v7| -; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v8 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_cvt_u32_f32_e32 v10, v2 -; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v6, v4 -; GFX7-NEXT: v_trunc_f32_e32 v3, v3 -; GFX7-NEXT: v_subb_u32_e32 v8, vcc, v5, v4, vcc +; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v2 +; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v6, v3 +; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GFX7-NEXT: v_mul_f32_e64 v7, |v3|, s4 -; GFX7-NEXT: v_floor_f32_e32 v7, v7 -; GFX7-NEXT: v_xor_b32_e32 v4, v9, v5 -; GFX7-NEXT: v_fma_f32 v9, v7, s5, |v3| +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_xor_b32_e32 v7, v8, v5 +; GFX7-NEXT: v_trunc_f32_e32 v8, v4 +; GFX7-NEXT: v_mul_f32_e64 v4, |v8|, s4 +; GFX7-NEXT: v_floor_f32_e32 v4, v4 +; GFX7-NEXT: v_xor_b32_e32 v6, v9, v5 +; GFX7-NEXT: v_fma_f32 v9, v4, s5, |v8| ; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v9 -; 
GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX7-NEXT: v_xor_b32_e32 v6, v10, v5 -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 -; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; GFX7-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v7, v5 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v8 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX7-NEXT: v_xor_b32_e32 v6, v9, v3 -; GFX7-NEXT: v_xor_b32_e32 v7, v7, v3 -; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 -; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc -; GFX7-NEXT: v_mov_b32_e32 v3, v8 +; GFX7-NEXT: v_xor_b32_e32 v6, v9, v7 +; GFX7-NEXT: v_xor_b32_e32 v8, v10, v7 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v6, v7 +; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i64: @@ -22198,6 +31052,13 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -22205,6 +31066,13 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -22212,6 +31080,12 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22219,8 +31093,16 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp i16 %x to bfloat @@ -22253,39 +31135,90 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; 
GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; 
GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <2 x i16> %x to <2 x bfloat> ret <2 x bfloat> %op @@ -22323,32 +31256,89 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 +; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v2, v2, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, 
v0, v2, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i16> %x to <3 x bfloat> @@ -22393,55 +31383,150 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; 
GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v11, v0, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 
v5, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v1 -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v0 -; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX11-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v7, v7, 
v3, 0x7fff +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v11, v0, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -22466,6 +31551,13 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -22473,6 +31565,13 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ 
-22480,6 +31579,12 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22487,7 +31592,14 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp i32 %x to bfloat @@ -22516,8 +31628,22 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, 
v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -22527,6 +31653,19 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -22536,6 +31675,17 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22544,6 +31694,19 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -22578,19 +31741,60 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: 
v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 @@ -22601,8 +31805,24 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i32> %x to <3 x bfloat> @@ -22639,10 +31859,39 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v3 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; 
GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 @@ -22653,9 +31902,34 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; 
GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 @@ -22666,9 +31940,30 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v9, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_and_or_b32 v11, v1, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v4, v10, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v7, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22677,10 +31972,32 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: 
v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_and_or_b32 v9, v0, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v11, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v4, v10, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i32> %x to <4 x bfloat> @@ -22739,6 +32056,13 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -22752,11 +32076,18 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 ; GFX9-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -22765,6 +32096,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2 @@ -22775,6 +32107,11 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22783,6 +32120,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX11-NEXT: v_cls_i32_e32 v3, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, 
v2 ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 @@ -22798,6 +32136,13 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp i64 %x to bfloat @@ -22877,23 +32222,38 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 ; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4 +; GFX8-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v0 ; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 -; GFX8-NEXT: v_min_u32_e32 v6, v0, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4 +; GFX8-NEXT: v_min_u32_e32 v7, v0, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v5, v2 -; GFX8-NEXT: v_sub_u32_e32 
v2, vcc, 32, v6 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v7 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -22908,24 +32268,37 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 ; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v6, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5 +; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -22936,6 +32309,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3 ; GFX10-NEXT: v_ffbh_i32_e32 v6, v1 ; GFX10-NEXT: v_ffbh_i32_e32 v7, v3 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 @@ -22956,6 +32330,16 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -22966,10 +32350,9 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: v_xor_b32_e32 v5, v2, v3 ; GFX11-NEXT: v_cls_i32_e32 v6, v1 ; GFX11-NEXT: v_cls_i32_e32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX11-NEXT: v_ashrrev_i32_e32 
v5, 31, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_nc_u32_e32 v6, -1, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v7, -1, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -22995,6 +32378,18 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -23101,36 +32496,58 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7 ; GFX8-NEXT: v_min_u32_e32 v6, v6, v7 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX8-NEXT: v_xor_b32_e32 v8, v0, v1 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 -; GFX8-NEXT: v_ldexp_f32 v6, v4, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX8-NEXT: v_ffbh_i32_e32 v4, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 -; GFX8-NEXT: v_min_u32_e32 v7, v4, v5 -; GFX8-NEXT: v_lshlrev_b64 
v[4:5], v7, v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX8-NEXT: v_min_u32_e32 v0, 1, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v5, v2, v3 -; GFX8-NEXT: v_ffbh_i32_e32 v4, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 -; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX8-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX8-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, -1, v7 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, v2, v3 +; GFX8-NEXT: v_ffbh_i32_e32 v6, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7 +; GFX8-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX8-NEXT: v_ldexp_f32 v0, v0, v4 ; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 
0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v6 ; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX8-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -23144,37 +32561,56 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ffbh_i32_e32 v6, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 -; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 
+; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 +; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5 +; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -23182,44 +32618,60 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v7, v0, v1 -; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5 +; GFX10-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX10-NEXT: v_xor_b32_e32 v7, v4, v5 ; 
GFX10-NEXT: v_xor_b32_e32 v9, v2, v3 -; GFX10-NEXT: v_ffbh_i32_e32 v6, v5 ; GFX10-NEXT: v_ffbh_i32_e32 v10, v1 +; GFX10-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v7 ; GFX10-NEXT: v_ffbh_i32_e32 v11, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9 -; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7 ; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX10-NEXT: v_min_u32_e32 v7, v10, v7 -; GFX10-NEXT: v_min_u32_e32 v9, v11, v9 -; GFX10-NEXT: v_min_u32_e32 v6, v6, v8 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] +; GFX10-NEXT: v_min_u32_e32 v8, v10, v8 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX10-NEXT: v_min_u32_e32 v7, v11, v9 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX10-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v9 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v4 -; GFX10-NEXT: v_ldexp_f32 
v2, v2, v6 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v2, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i64> %x to <3 x bfloat> ret <3 x bfloat> %op @@ -23350,48 +32802,77 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 ; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 +; GFX8-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_xor_b32_e32 v5, v6, v7 -; GFX8-NEXT: v_cvt_f32_i32_e32 v9, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 -; GFX8-NEXT: v_min_u32_e32 v10, v4, v5 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 
32, v8 +; GFX8-NEXT: v_min_u32_e32 v11, v4, v5 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v10 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1 +; GFX8-NEXT: v_ffbh_i32_e32 v8, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX8-NEXT: v_ldexp_f32 v5, v9, v6 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 -; GFX8-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 +; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v11 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 -; GFX8-NEXT: v_ffbh_i32_e32 v6, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7 -; GFX8-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 +; GFX8-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX8-NEXT: v_cvt_f32_i32_e32 v7, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0 ; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
-1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 -; GFX8-NEXT: v_min_u32_e32 v8, v0, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 +; GFX8-NEXT: v_min_u32_e32 v9, v0, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] +; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v7, v2 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v9 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 @@ -23407,50 +32888,75 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 ; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_sub_u32_e32 v8, 32, v8 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 ; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v4 +; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 ; GFX9-NEXT: v_ffbh_i32_e32 v4, v7 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: 
v_min_u32_e32 v10, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_ldexp_f32 v6, v9, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v11, v4, v5 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v10 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 -; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v9, v0, v1 +; GFX9-NEXT: v_ffbh_i32_e32 v8, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX9-NEXT: v_add_u32_e32 v8, -1, v8 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v10 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v6, 32, v11 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 ; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v8, 
v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v5 +; GFX9-NEXT: v_min_u32_e32 v9, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16: @@ -23460,16 +32966,16 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX10-NEXT: v_ffbh_i32_e32 v9, v5 ; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7 ; GFX10-NEXT: v_xor_b32_e32 v13, v0, v1 -; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3 +; GFX10-NEXT: v_ffbh_i32_e32 v10, v7 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v9, -1, v9 -; GFX10-NEXT: v_ffbh_i32_e32 v10, v7 ; GFX10-NEXT: v_ffbh_i32_e32 v12, v1 +; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX10-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 ; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12 +; GFX10-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11 ; GFX10-NEXT: v_min_u32_e32 v8, v9, v8 ; 
GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v13 @@ -23479,33 +32985,54 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] ; GFX10-NEXT: v_min_u32_e32 v9, v12, v9 ; GFX10-NEXT: v_min_u32_e32 v11, v13, v14 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX10-NEXT: v_min_u32_e32 v6, 1, v6 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX10-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11 +; GFX10-NEXT: v_min_u32_e32 v5, 1, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v8 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v10 +; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX10-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v5 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v2, v4, v6 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v10 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX10-NEXT: v_ldexp_f32 v3, v4, v7 +; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_ldexp_f32 v3, v3, v4 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v5 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v6 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 +; GFX10-NEXT: 
v_add3_u32 v4, v7, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX10-NEXT: v_add3_u32 v4, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v7, v8, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16: @@ -23515,16 +33042,16 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_cls_i32_e32 v9, v5 ; GFX11-NEXT: v_xor_b32_e32 v11, v6, v7 ; GFX11-NEXT: v_xor_b32_e32 v13, v0, v1 -; GFX11-NEXT: v_xor_b32_e32 v14, v2, v3 +; GFX11-NEXT: v_cls_i32_e32 v10, v7 ; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v9 -; GFX11-NEXT: v_cls_i32_e32 v10, v7 ; GFX11-NEXT: v_cls_i32_e32 v12, v1 +; GFX11-NEXT: v_xor_b32_e32 v14, v2, v3 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12 +; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v8, v9, v8 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v13 @@ -23534,40 +33061,61 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, 
v[4:5] ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_u32_e32 v9, v12, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v11, v13, v14 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v8 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_sub_nc_u32_e32 v7, 32, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v8 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v5 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f32 v2, v4, v6 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v5 +; GFX11-NEXT: 
v_sub_nc_u32_e32 v4, 32, v10 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX11-NEXT: v_ldexp_f32 v3, v4, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v5 -; GFX11-NEXT: v_ldexp_f32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v6 +; GFX11-NEXT: v_add3_u32 v4, v7, v2, 0x7fff +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v9, v3, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-NEXT: v_add3_u32 v4, v6, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v6, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v1, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i64> %x to <4 x 
bfloat> ret <4 x bfloat> %op @@ -23594,6 +33142,13 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -23601,6 +33156,13 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23608,6 +33170,12 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -23615,8 +33183,16 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp i16 %x to bfloat @@ -23649,10 +33225,24 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: 
v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16: @@ -23660,6 +33250,19 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -23669,6 +33272,17 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ 
-23677,9 +33291,22 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -23719,20 +33346,61 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, 
v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 
0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v2, v2, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 @@ -23743,8 +33411,24 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = uitofp <3 x i16> %x to <3 x bfloat> @@ -23789,23 +33473,77 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-LABEL: 
v_uitofp_v4i16_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_bfe_u32 v4, v0, 
16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 
+; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 @@ -23816,9 +33554,30 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v11, v0, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v7, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -23826,17 +33585,42 @@ define <4 x bfloat> 
@v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v7, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-NEXT: v_and_or_b32 v9, v3, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_and_or_b32 v11, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: 
v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo ; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i16> %x to <4 x bfloat> @@ -23862,6 +33646,13 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -23869,6 +33660,13 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23876,6 +33674,12 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 
0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -23883,7 +33687,14 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp i32 %x to bfloat @@ -23912,8 +33723,22 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -23923,6 +33748,19 @@ define 
<2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -23932,6 +33770,17 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -23940,6 +33789,19 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -23974,19 +33836,60 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 @@ -23997,8 +33900,24 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; 
GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = uitofp <3 x i32> %x to <3 x bfloat> @@ -24035,10 +33954,39 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v3 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; 
GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 @@ -24049,9 +33997,34 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 @@ -24062,9 +34035,30 @@ 
define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_and_or_b32 v9, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_and_or_b32 v11, v1, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v4, v10, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v7, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -24073,10 +34067,32 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; 
GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_and_or_b32 v9, v0, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v11, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v4, v10, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i32> %x to <4 x bfloat> @@ -24123,6 +34139,13 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -24132,11 +34155,18 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX9-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: 
v_sub_u32_e32 v1, 32, v2 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -24144,6 +34174,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 @@ -24151,6 +34182,11 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -24158,6 +34194,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -24168,7 +34205,13 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: 
v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp i64 %x to bfloat @@ -24230,17 +34273,31 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4 +; GFX8-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX8-NEXT: v_min_u32_e32 v6, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4 +; GFX8-NEXT: v_min_u32_e32 v7, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v5, v2 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v7 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, 
v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -24251,20 +34308,33 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v4, v1 ; GFX9-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5 +; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -24273,6 +34343,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v4, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v5, v3 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: 
v_min_u32_e32 v4, 32, v4 ; GFX10-NEXT: v_min_u32_e32 v5, 32, v5 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -24287,6 +34358,16 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -24295,6 +34376,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1 ; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX11-NEXT: v_min_u32_e32 v5, 32, v5 @@ -24315,6 +34397,18 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -24393,28 +34487,50 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_ffbh_u32_e32 v6, v5 ; GFX8-NEXT: v_min_u32_e32 v6, 32, v6 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX8-NEXT: v_ffbh_u32_e32 v7, v1 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 -; GFX8-NEXT: v_ldexp_f32 v6, v4, v5 -; GFX8-NEXT: v_ffbh_u32_e32 v4, v1 -; GFX8-NEXT: v_min_u32_e32 v7, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX8-NEXT: v_min_u32_e32 v0, 1, v4 -; GFX8-NEXT: v_ffbh_u32_e32 v4, v3 -; GFX8-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX8-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX8-NEXT: v_ffbh_u32_e32 v6, v3 +; GFX8-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v4 ; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; 
GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v6 ; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX8-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -24425,28 +34541,47 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v6, v5 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v6, 
0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 +; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5 +; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -24455,31 +34590,47 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v6, v1 -; GFX10-NEXT: v_ffbh_u32_e32 v7, v3 -; GFX10-NEXT: v_ffbh_u32_e32 v8, v5 +; GFX10-NEXT: v_ffbh_u32_e32 v8, v3 +; GFX10-NEXT: v_ffbh_u32_e32 v7, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX10-NEXT: v_min_u32_e32 v7, 32, v7 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX10-NEXT: 
v_min_u32_e32 v7, 32, v7 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v7 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX10-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX10-NEXT: v_ldexp_f32 v1, v1, v4 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v8 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v7 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v2, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX10-NEXT: v_alignbit_b32 
v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = uitofp <3 x i64> %x to <3 x bfloat> ret <3 x bfloat> %op @@ -24574,36 +34725,65 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_ffbh_u32_e32 v8, v5 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 +; GFX8-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4 ; GFX8-NEXT: v_ffbh_u32_e32 v4, v7 -; GFX8-NEXT: v_min_u32_e32 v10, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v8 +; GFX8-NEXT: v_min_u32_e32 v11, 32, v4 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] +; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX8-NEXT: v_ldexp_f32 v5, v9, v6 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_ffbh_u32_e32 v8, v1 +; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v11 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 -; GFX8-NEXT: v_ffbh_u32_e32 v6, v1 -; GFX8-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; 
GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 +; GFX8-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX8-NEXT: v_min_u32_e32 v8, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 +; GFX8-NEXT: v_min_u32_e32 v9, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] +; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v7, v2 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v9 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 @@ -24615,99 +34795,145 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v8, v5 ; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_sub_u32_e32 v8, 32, v8 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 
v4, v4 +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 ; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 -; GFX9-NEXT: v_min_u32_e32 v10, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 +; GFX9-NEXT: v_min_u32_e32 v11, 32, v4 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_ffbh_u32_e32 v8, v1 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v10 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, 32, v11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_ldexp_f32 v6, v9, v8 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v5 +; GFX9-NEXT: v_min_u32_e32 v9, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; 
GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v8, v5 -; GFX10-NEXT: v_ffbh_u32_e32 v9, v1 -; GFX10-NEXT: v_ffbh_u32_e32 v10, v3 -; GFX10-NEXT: v_ffbh_u32_e32 v11, v7 +; GFX10-NEXT: v_ffbh_u32_e32 v10, v1 +; GFX10-NEXT: v_ffbh_u32_e32 v11, v3 +; GFX10-NEXT: v_ffbh_u32_e32 v9, v7 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX10-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX10-NEXT: v_min_u32_e32 v11, 32, v11 +; GFX10-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 ; 
GFX10-NEXT: v_min_u32_e32 v6, 1, v6 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v11 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v10 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v11 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v10 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX10-NEXT: v_ldexp_f32 v4, v4, v8 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX10-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX10-NEXT: v_ldexp_f32 v2, v2, v8 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v5 -; GFX10-NEXT: v_ldexp_f32 v2, v2, v6 -; GFX10-NEXT: v_ldexp_f32 v1, v3, v1 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_ldexp_f32 v4, v4, v9 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v3, v9, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v4, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5 -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v1 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v3 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 +; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7] ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 32, v8 +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 @@ -24715,24 +34941,45 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v3, v7, v6 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v11 -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v4 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v11 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v10 -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: v_ldexp_f32 v4, v4, v8 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GFX11-NEXT: v_ldexp_f32 v2, v2, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v2, v2, v6 -; GFX11-NEXT: v_ldexp_f32 v1, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v9 +; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v3, v9, v1, 0x7fff +; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: 
v_and_or_b32 v9, v4, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op @@ -24742,6 +34989,8 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GCN-LABEL: v_select_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc @@ -24752,6 +35001,8 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -24797,8 +35048,9 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GCN-LABEL: v_select_fneg_lhs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -24808,7 +35060,8 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX7: 
; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -24859,8 +35112,9 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GCN-LABEL: v_select_fneg_rhs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -24870,7 +35124,8 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -24921,11 +35176,15 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GCN-LABEL: v_select_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, 
vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc @@ -24936,6 +35195,10 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX7-LABEL: v_select_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -25006,7 +35269,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GCN-LABEL: v_vselect_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc @@ -25021,7 +35288,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc @@ -25094,20 +35365,20 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GCN-LABEL: s_select_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, 
s1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_select_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -25162,17 +35433,17 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 @@ -25180,17 +35451,17 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, 
<2 x bfloat> inreg ; ; GFX7-LABEL: s_select_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: s_lshr_b32 s3, s3, 16 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -25265,14 +35536,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) { ; GCN-LABEL: s_vselect_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 @@ -25281,14 +35552,14 @@ 
define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX7-LABEL: s_vselect_v2bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 +; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -25369,37 +35640,49 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GCN-LABEL: v_select_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v3bf16: @@ -25445,11 +35728,19 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GCN-LABEL: v_select_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 
v0, 1, v0 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 @@ -25466,15 +35757,23 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX7-LABEL: v_select_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -25527,13 +35826,25 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GCN-LABEL: v_select_v6bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; 
GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 @@ -25555,19 +35866,31 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX7-LABEL: v_select_v6bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; 
GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc @@ -25627,6 +35950,23 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GCN-LABEL: v_select_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -25635,7 +35975,6 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16 ; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 @@ -25662,23 +36001,39 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX7-LABEL: v_select_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13 ; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v13, 16 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v15, 16 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc @@ -25744,47 +36099,81 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-LABEL: 
v_select_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; GCN-NEXT: v_alignbit_b32 v6, v6, v21, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: 
v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v24 -; GCN-NEXT: v_alignbit_b32 v8, v8, v23, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; GCN-NEXT: v_alignbit_b32 v10, v10, v25, 16 +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_alignbit_b32 v12, v17, v27, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 ; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v14, v18, v29, 16 +; GCN-NEXT: v_alignbit_b32 v14, v19, v20, 16 ; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc -; 
GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -25799,10 +36188,8 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v19 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16 ; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -25811,44 +36198,74 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-LABEL: v_select_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, 16, v18 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v24 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v23, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19 ; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v17, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v23 ; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v17, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 
16, v10 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v17, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28 +; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27 +; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v30 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX7-NEXT: v_alignbit_b32 v6, v6, v21, 16 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v25, 16 -; GFX7-NEXT: v_alignbit_b32 v17, v17, v27, 16 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX7-NEXT: v_alignbit_b32 v14, v19, v29, 16 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29 ; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -25860,17 +36277,19 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v6, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v8, v15, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -25939,156 +36358,220 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GCN-LABEL: v_select_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GCN-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; GCN-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NEXT: v_alignbit_b32 v5, v5, v9, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v12 -; GCN-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v14 -; GCN-NEXT: v_alignbit_b32 v7, v7, v13, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GCN-NEXT: v_alignbit_b32 v9, 
v9, v17, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v20 -; GCN-NEXT: v_alignbit_b32 v10, v10, v19, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v22 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v24 -; GCN-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; GCN-NEXT: v_alignbit_b32 v13, v13, v25, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v28 -; GCN-NEXT: v_alignbit_b32 v14, v14, v27, 16 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GCN-NEXT: v_alignbit_b32 v15, v15, v29, 16 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: 
v_mul_f32_e32 v6, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, 
v15 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_alignbit_b32 v16, v16, v19, 16 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_alignbit_b32 v17, v17, v21, 16 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v22 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_alignbit_b32 v18, v21, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; 
GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v23, 16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v23, v23, v25, 16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword 
v26, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v25, v25, v27, 16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: 
v_mul_f32_e32 v28, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v27, v27, v29, 16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v29, v29, v31, 16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v33, 16 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc -; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v15, vcc -; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v14, 
vcc -; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v12, vcc -; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v11, vcc -; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v10, vcc -; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v15, v22, v8, vcc -; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc +; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc +; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v13, v21, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -26103,8 +36586,8 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23 ; GCN-NEXT: v_and_b32_e32 
v17, 0xffff0000, v23 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24 @@ -26126,173 +36609,241 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX7-LABEL: v_select_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v16 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; GFX7-NEXT: v_alignbit_b32 v7, v7, v13, 16 -; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v15, 16 -; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX7-NEXT: v_alignbit_b32 v9, v9, v17, 16 -; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; 
GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; 
GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 -; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: s_waitcnt vmcnt(13) +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_waitcnt vmcnt(10) -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: s_waitcnt vmcnt(12) +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: s_waitcnt vmcnt(11) +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: s_waitcnt vmcnt(9) +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: s_waitcnt vmcnt(4) +; 
GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16 +; GFX7-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:48 +; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX7-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v16, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v13, v4, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v16 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GFX7-NEXT: 
v_cndmask_b32_e32 v17, v17, v8, vcc +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v9, v14, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v12, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v16 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v17 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_cndmask_b32_e32 v21, 
v22, v21, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 ; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16 ; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc @@ -26408,18 +36959,20 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> define amdgpu_ps <2 x i32> 
@s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: s_lshr_b32 s0, s4, 16 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NEXT: s_lshr_b32 s3, s5, 16 -; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_readfirstlane_b32 s1, v0 @@ -26427,18 +36980,20 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; ; GFX7-LABEL: s_select_v3bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s4, 16 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: v_alignbit_b32 v1, s1, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX7-NEXT: s_lshr_b32 s0, s2, 16 -; GFX7-NEXT: s_lshr_b32 s1, s5, 16 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 
16, v2 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX7-NEXT: v_readfirstlane_b32 s0, v1 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 @@ -26513,18 +37068,22 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: s_lshr_b32 s0, s5, 16 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, s3, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, s2, v4, 16 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7 +; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, 
v1, vcc @@ -26534,18 +37093,22 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX7-LABEL: s_select_v4bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s5, 16 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX7-NEXT: s_lshr_b32 s0, s3, 16 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: v_alignbit_b32 v3, s0, v3, 16 -; GFX7-NEXT: s_lshr_b32 s0, s7, 16 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: v_alignbit_b32 v1, s1, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v4, s0, v4, 16 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -26616,22 +37179,22 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) { ; GCN-LABEL: s_vselect_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v4, s7 -; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s6 +; 
GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s7 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NEXT: v_mov_b32_e32 v5, s2 +; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -26644,27 +37207,27 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX7-LABEL: s_vselect_v4bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3 +; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s2 +; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s1 +; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mov_b32_e32 v3, s4 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0 +; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s4 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 @@ -26796,9 +37359,17 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GCN-LABEL: v_vselect_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc @@ -26819,13 +37390,21 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc ; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc @@ -26938,13 +37517,29 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GCN-LABEL: v_vselect_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v5, 1, v5 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v7, 1, v7 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc @@ -26977,25 +37572,41 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 
v23, 1.0, v23 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc @@ -27214,72 +37825,104 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v9 ; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v10 -; GCN-NEXT: v_and_b32_e32 v1, 1, v11 -; GCN-NEXT: v_and_b32_e32 v2, 1, v12 -; GCN-NEXT: v_and_b32_e32 v3, 1, v13 -; GCN-NEXT: v_and_b32_e32 v4, 1, v14 -; GCN-NEXT: v_and_b32_e32 v5, 1, v15 
-; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 -; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v15, v3, v2, s[34:35] -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v14, v4, v30, s[30:31] -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v13, v5, v29, s[28:29] -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v12, v2, v28, s[26:27] -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v11, v3, v27, s[24:25] -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v10, v4, v26, s[22:23] -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v9, v2, v25, s[20:21] -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v8, v3, v24, s[18:19] -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; GCN-NEXT: v_and_b32_e32 v1, 1, v10 +; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; GCN-NEXT: v_and_b32_e32 v3, 1, v11 +; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v5, 1, v12 +; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19 +; GCN-NEXT: v_and_b32_e32 v7, 1, v13 +; GCN-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 +; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v8 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_and_b32_e32 v9, 1, v15 +; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v7, v4, v23, s[16:17] -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v6, v2, v22, s[14:15] -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v5, v3, v21, s[12:13] -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[10:11] +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[34:35] +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[30:31] +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_cndmask_b32_e64 v13, v7, v9, s[28:29] +; GCN-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v28 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[26:27] +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v27 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_cndmask_b32_e64 v11, v7, v9, s[24:25] +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_cndmask_b32_e64 v10, v8, v9, s[22:23] +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cndmask_b32_e64 v19, v2, v19, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v2, v1, v18, s[6:7] +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[20:21] +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[18:19] +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[16:17] +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[14:15] +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13] +; GCN-NEXT: buffer_load_dword v20, off, 
s[0:3], s32 offset:20 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v17, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11] +; GCN-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -27303,7 +37946,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -27329,87 +37972,122 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 ; GFX7-NEXT: 
v_and_b32_e32 v0, 1, v11 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX7-NEXT: v_writelane_b32 v32, s30, 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX7-NEXT: v_writelane_b32 v32, s31, 1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX7-NEXT: v_writelane_b32 v32, s34, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX7-NEXT: v_writelane_b32 v32, s35, 3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 -; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 -; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 -; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[6:7] -; GFX7-NEXT: s_waitcnt vmcnt(13) -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[8:9] -; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, 
v20, s[10:11] -; GFX7-NEXT: s_waitcnt vmcnt(11) -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[12:13] -; GFX7-NEXT: s_waitcnt vmcnt(10) -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[14:15] -; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[16:17] -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[18:19] -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v25, s[20:21] -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[22:23] +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v12 +; GFX7-NEXT: v_writelane_b32 v31, s30, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_and_b32_e32 v3, 1, v13 +; GFX7-NEXT: v_writelane_b32 v31, s31, 1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_and_b32_e32 v4, 1, v14 +; GFX7-NEXT: v_writelane_b32 v31, s34, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_and_b32_e32 v5, 1, v15 +; GFX7-NEXT: v_writelane_b32 v31, s35, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5 +; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[24:25] +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[26:27] -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[28:29] -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_cndmask_b32_e64 
v14, v14, v30, s[30:31] -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e64 v15, v31, v15, s[34:35] -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[34:35] +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[30:31] +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[28:29] +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v28 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v3, s[26:27] +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v27 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25] +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; 
GFX7-NEXT: v_readlane_b32 s35, v32, 3 -; GFX7-NEXT: v_readlane_b32 s34, v32, 2 -; GFX7-NEXT: v_readlane_b32 s31, v32, 1 -; GFX7-NEXT: v_readlane_b32 s30, v32, 0 +; GFX7-NEXT: v_readlane_b32 s35, v31, 3 +; GFX7-NEXT: v_readlane_b32 s34, v31, 2 +; GFX7-NEXT: v_readlane_b32 s31, v31, 1 +; GFX7-NEXT: v_readlane_b32 s30, v31, 0 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23] +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v9, v1, v5, s[20:21] +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v2, v5, s[18:19] +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v7, v3, v5, s[16:17] +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v6, v4, v5, s[14:15] +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13] +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[8:9] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v16, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -27762,183 +38440,266 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-LABEL: v_vselect_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 1, v30 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_and_b32_e32 v5, 1, v5 +; GCN-NEXT: v_and_b32_e32 v36, 1, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; GCN-NEXT: v_and_b32_e32 v29, 1, v29 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:252 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_and_b32_e32 v51, 1, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:152 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v51 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v31 -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: v_cndmask_b32_e64 v31, v34, v33, s[4:5] +; GCN-NEXT: v_and_b32_e32 v30, 1, v30 +; GCN-NEXT: v_and_b32_e32 v48, 1, v28 +; GCN-NEXT: v_and_b32_e32 v50, 1, v27 +; GCN-NEXT: v_and_b32_e32 v52, 1, v26 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:220 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:232 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:240 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:252 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v38 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v56 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30 +; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5] +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v36, off, 
s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:244 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:212 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v46 +; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v47 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cndmask_b32_e64 v29, v36, v35, s[4:5] -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v22, 1, v22 -; GCN-NEXT: v_and_b32_e32 v26, 1, v26 -; GCN-NEXT: v_and_b32_e32 v28, 1, v28 -; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_cndmask_b32_e64 v29, v46, v38, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v48 +; GCN-NEXT: v_cndmask_b32_e64 v36, v37, v36, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v50 +; GCN-NEXT: v_cndmask_b32_e64 v37, v45, v44, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v52 +; GCN-NEXT: v_cndmask_b32_e64 v38, v40, v55, s[4:5] +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; GCN-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:140 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:152 +; GCN-NEXT: v_and_b32_e32 v9, 1, v9 +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 +; GCN-NEXT: v_and_b32_e32 v21, 1, v21 ; GCN-NEXT: v_and_b32_e32 v25, 1, v25 ; GCN-NEXT: v_and_b32_e32 v24, 1, v24 ; GCN-NEXT: v_and_b32_e32 v23, 1, v23 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28 +; GCN-NEXT: v_and_b32_e32 v22, 1, v22 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v56 ; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: v_cndmask_b32_e64 v28, v38, v37, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cndmask_b32_e64 v27, v39, v51, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v26 -; GCN-NEXT: v_cndmask_b32_e64 v26, v36, v35, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v57 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25 -; GCN-NEXT: v_cndmask_b32_e64 v25, v49, v48, s[4:5] -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:228 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:212 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v51, 
off, s[0:3], s32 offset:72 +; GCN-NEXT: v_cndmask_b32_e64 v25, v54, v53, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cndmask_b32_e64 v24, v35, v50, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v24, v56, v47, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cndmask_b32_e64 v23, v36, v34, s[4:5] -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:204 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_and_b32_e32 v21, 1, v21 +; GCN-NEXT: v_cndmask_b32_e64 v23, v51, v49, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22 +; GCN-NEXT: v_cndmask_b32_e64 v22, v39, v28, s[4:5] +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:200 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:204 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:208 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:216 ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22 +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 +; GCN-NEXT: v_and_b32_e32 v16, 1, v16 +; GCN-NEXT: v_and_b32_e32 v15, 1, v15 +; GCN-NEXT: v_and_b32_e32 v14, 1, v14 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cndmask_b32_e64 v22, v33, v50, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v58 +; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v59 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: 
v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21 -; GCN-NEXT: v_cndmask_b32_e64 v21, v38, v37, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v21, v56, v47, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 -; GCN-NEXT: v_cndmask_b32_e64 v20, v48, v39, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v20, v58, v57, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19 -; GCN-NEXT: v_cndmask_b32_e64 v19, v49, v36, s[4:5] -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 +; GCN-NEXT: v_cndmask_b32_e64 v19, v54, v53, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 -; GCN-NEXT: v_cndmask_b32_e64 v18, v35, v34, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 +; GCN-NEXT: v_cndmask_b32_e64 v18, v51, v49, s[4:5] +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:180 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:164 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v60 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cndmask_b32_e64 v17, v33, v51, s[4:5] -; GCN-NEXT: 
buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_and_b32_e32 v14, 1, v14 -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v15, 1, v15 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 +; GCN-NEXT: v_cndmask_b32_e64 v17, v39, v28, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GCN-NEXT: s_waitcnt vmcnt(9) -; GCN-NEXT: v_cndmask_b32_e64 v16, v37, v36, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v16, v49, v59, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cndmask_b32_e64 v15, v38, v51, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 -; GCN-NEXT: v_cndmask_b32_e64 v14, v35, v34, s[4:5] -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:176 +; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5] +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:160 -; GCN-NEXT: v_and_b32_e32 v10, 1, v10 -; GCN-NEXT: v_and_b32_e32 v13, 1, v13 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v13 -; GCN-NEXT: v_cndmask_b32_e64 v13, v48, v39, s[4:5] -; GCN-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 ; GCN-NEXT: v_and_b32_e32 v12, 1, v12 ; GCN-NEXT: v_and_b32_e32 v11, 1, v11 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v12, v50, v49, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cndmask_b32_e64 v11, v34, v33, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cndmask_b32_e64 v10, v48, v39, s[4:5] -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_and_b32_e32 v6, 1, v6 -; GCN-NEXT: v_and_b32_e32 v9, 1, v9 +; GCN-NEXT: v_and_b32_e32 v10, 1, v10 ; GCN-NEXT: v_and_b32_e32 v8, 1, v8 ; GCN-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN-NEXT: v_and_b32_e32 v6, 1, v6 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 ; GCN-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 -; GCN-NEXT: v_cndmask_b32_e64 v9, v36, v35, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 -; GCN-NEXT: v_cndmask_b32_e64 v8, v38, v37, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 +; GCN-NEXT: v_cndmask_b32_e32 v41, v42, v41, vcc +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 +; GCN-NEXT: v_and_b32_e32 v43, 1, v43 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; 
GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cndmask_b32_e64 v7, v51, v50, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; GCN-NEXT: v_cndmask_b32_e64 v6, v49, v48, s[4:5] -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 +; GCN-NEXT: 
v_mul_f32_e32 v42, 1.0, v42 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GCN-NEXT: v_cndmask_b32_e32 v12, v53, v51, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GCN-NEXT: v_cndmask_b32_e32 v11, v31, v13, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GCN-NEXT: v_cndmask_b32_e32 v10, v59, v49, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GCN-NEXT: v_cndmask_b32_e32 v9, v39, v35, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-NEXT: v_cndmask_b32_e32 v8, v47, v54, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e32 v7, v34, v33, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-NEXT: v_cndmask_b32_e32 v6, v32, v28, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GCN-NEXT: v_cndmask_b32_e32 v5, v46, v45, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cndmask_b32_e32 v4, v36, v35, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v57, v56, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cndmask_b32_e32 v3, v37, v39, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v55, v52, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v34, v33, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v50, v48, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cndmask_b32_e32 v1, v48, v38, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v44, v40, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cndmask_b32_e32 v0, v50, v49, vcc -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 1, v51 +; GCN-NEXT: v_cndmask_b32_e32 v0, v42, v58, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v43 +; GCN-NEXT: v_cndmask_b32_e32 v31, v27, v26, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -27952,7 +38713,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-NEXT: 
v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v41 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 @@ -27965,45 +38726,49 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v36 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v33 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, 
s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cndmask_b32_e32 v32, v32, v30, vcc -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_vselect_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 +; GFX7-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v25 +; GFX7-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v30 +; GFX7-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v29 ; GFX7-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v28 ; GFX7-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v27 ; GFX7-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX7-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX7-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v26 ; GFX7-NEXT: v_and_b32_e32 v23, 1, v23 ; GFX7-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28 -; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v27 -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v26 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v25 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v24 -; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v23 -; GFX7-NEXT: buffer_load_dword v23, 
off, s[0:3], s32 offset:244 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v22 -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:240 -; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GFX7-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX7-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29 ; GFX7-NEXT: v_and_b32_e32 v21, 1, v21 ; GFX7-NEXT: v_and_b32_e32 v20, 1, v20 ; GFX7-NEXT: v_and_b32_e32 v19, 1, v19 @@ -28026,183 +38791,286 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_cndmask_b32_e64 v30, v27, v28, s[4:5] -; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_cndmask_b32_e64 v28, v23, v24, s[6:7] -; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 -; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:124 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_cndmask_b32_e32 v29, v25, v26, vcc -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[12:13] +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 +; 
GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[14:15] +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_cndmask_b32_e64 v27, v22, v27, s[8:9] -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[16:17] +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[10:11] +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_cndmask_b32_e64 v26, v23, v25, s[10:11] -; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:232 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[8:9] +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e64 v25, 
v23, v22, s[12:13] -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 -; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[6:7] +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e64 v24, v23, v22, s[14:15] -; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[4:5] +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e64 v23, v22, v23, s[16:17] +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e64 v22, v31, v22, s[18:19] -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v31, 1, v31 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v31 -; 
GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 ; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 ; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v19, v19, v32, vcc +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, 
v19 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v15, v15, v32, vcc +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 
; GFX7-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 ; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 ; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:180 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v11, v11, v32, vcc +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v32, vcc +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; 
GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -29183,6 +40051,9 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GCN-LABEL: v_fma_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -29193,6 +40064,9 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX7-LABEL: v_fma_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -29207,6 +40081,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -29217,6 +40098,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; 
GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -29226,8 +40114,14 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v1, v2, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fma_bf16: @@ -29236,9 +40130,17 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, v0, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %op @@ -29248,6 +40150,12 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, 
<2 x bfloat> ; GCN-LABEL: v_fma_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -29263,6 +40171,12 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX7-LABEL: v_fma_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -29281,11 +40195,25 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; 
GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -29296,11 +40224,24 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -29314,9 +40255,20 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 +; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v3, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; 
GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fma_v2bf16: @@ -29328,9 +40280,23 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4 -; GFX11-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v3, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) ret <2 x bfloat> %op @@ -29340,6 +40306,15 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GCN-LABEL: v_fma_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -29360,9 +40335,18 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX7-LABEL: v_fma_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_fma_f32 v2, v2, v5, v8 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 @@ -29384,14 +40368,36 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3 +; GFX8-NEXT: 
v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 @@ -29404,14 +40410,33 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX9-NEXT: v_add3_u32 v5, 
v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 @@ -29420,20 +40445,36 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX10-LABEL: v_fma_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 -; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1 +; GFX10-NEXT: v_and_or_b32 v3, v6, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1 +; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_and_or_b32 v8, v5, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; 
GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op @@ -29443,6 +40484,18 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GCN-LABEL: v_fma_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -29468,13 +40521,25 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX7-LABEL: v_fma_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v9, 
1.0, v9 ; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_fma_f32 v2, v2, v6, v7 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 @@ -29496,20 +40561,49 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 ; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 
s4, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 @@ -29521,19 +40615,44 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; 
GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -29547,43 +40666,89 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v10 -; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v4, v9, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v5, v6, 0x7060302 +; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_fmac_f32_e32 v4, v0, 
v2 +; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff +; GFX10-NEXT: v_and_or_b32 v1, v6, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX10-NEXT: v_and_or_b32 v9, v5, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff +; GFX10-NEXT: v_and_or_b32 v3, v7, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v4, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fma_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_fmac_f32 v9, v11, v10 :: v_dual_lshlrev_b32 v6, 16, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_dual_fmac_f32 v4, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 
16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v9, 0x7060302 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX11-NEXT: v_perm_b32 v1, v5, v6, 0x7060302 +; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_and_or_b32 v1, v6, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2 +; GFX11-NEXT: v_add3_u32 v0, v10, v6, 0x7fff +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v8 +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v0, v2, v5, 0x7fff +; GFX11-NEXT: v_and_or_b32 v9, v5, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v6, v8, v4, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v2, v3, v7, 0x7fff +; GFX11-NEXT: v_and_or_b32 v3, v7, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo 
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op @@ -29598,6 +40763,9 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GCN-LABEL: v_fmuladd_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -29610,8 +40778,11 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX7-LABEL: v_fmuladd_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 @@ -29625,9 +40796,23 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 
+; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -29637,9 +40822,22 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -29648,10 +40846,21 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v3, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: 
v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -29660,11 +40869,25 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) @@ -29675,6 +40898,12 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x 
bfloat> %b, <2 x bfl ; GCN-LABEL: v_fmuladd_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 @@ -29694,10 +40923,16 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX7-LABEL: v_fmuladd_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -29715,16 +40950,45 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_bfe_u32 v4, v3, 
16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -29734,16 +40998,41 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v5, 
0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -29755,14 +41044,35 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX10-NEXT: v_and_or_b32 v5, v3, s4, 0x400000 +; 
GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_and_or_b32 v6, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -29773,16 +41083,43 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 
1 +; GFX11-NEXT: v_and_or_b32 v5, v3, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v1, v1, v3, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v6, v0, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) @@ -29793,6 +41130,15 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GCN-LABEL: v_fmuladd_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 @@ -29819,12 +41165,21 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX7-LABEL: v_fmuladd_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 @@ -29848,21 +41203,64 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 @@ -29874,21 +41272,58 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; 
GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 @@ -29904,19 +41339,50 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: s_brev_b32 s4, 1 ; GFX10-NEXT: 
v_mul_f32_e32 v3, v7, v6 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_and_or_b32 v10, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 
v0, v5, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op @@ -29926,6 +41392,18 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GCN-LABEL: v_fmuladd_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 @@ -29959,6 +41437,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX7-LABEL: v_fmuladd_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -29967,6 +41453,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; 
GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5 @@ -29994,29 +41484,86 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX8-NEXT: v_mul_f32_e32 v6, v7, 
v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 
-; GFX8-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 @@ -30027,28 +41574,77 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 
0xffff0000, v5 -; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -30061,62 +41657,147 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: v_mul_f32_e32 v3, v8, v7 +; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX10-NEXT: v_and_or_b32 v3, v6, s4, 0x400000 +; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_and_or_b32 v10, v7, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: 
v_cndmask_b32_e32 v1, v2, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_and_or_b32 v12, v0, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff +; GFX10-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 -; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 +; GFX10-NEXT: v_and_or_b32 v5, v3, s4, 0x400000 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo +; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff +; GFX10-NEXT: v_and_or_b32 v6, v2, s4, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v1, v1, 
v3, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmuladd_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v2, v6, v2 :: v_dual_mul_f32 v1, v1, v3 -; GFX11-NEXT: v_mul_f32_e32 v3, v8, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX11-NEXT: v_and_or_b32 v3, v6, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-NEXT: v_and_or_b32 v6, v1, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_and_or_b32 v10, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v9, v9, v7, 0x7fff +; GFX11-NEXT: v_bfe_u32 v11, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v11, v11, v0, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v9, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_and_or_b32 v5, v3, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v4, v7, v3, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo +; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v7, v2, 0x7fff +; GFX11-NEXT: v_and_or_b32 v6, v2, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff +; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll index a69fb35f8f0cb..cfe1e46bf2c5e 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -787,6 +787,13 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -796,7 +803,14 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext half %arg0 to float diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 9a8ddb5bd3831..cd1ec85eb8d0f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1233,8 +1233,12 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 38bfee961dd29..db89ad66ffab0 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -2775,9 +2775,9 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:4 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 @@ -2791,8 +2791,9 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; CI-NEXT: v_and_b32_e32 v0, 1, v17 -; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 ; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 @@ -2801,7 +2802,7 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v20, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -3108,22 +3109,14 @@ define void 
@void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -3132,21 +3125,35 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v8, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_mul_f32_e32 v9, 1.0, v20 +; CI-NEXT: v_mul_f32_e32 v10, 1.0, v16 +; CI-NEXT: v_mul_f32_e32 v11, 1.0, v17 +; CI-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; CI-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; CI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v17 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; CI-NEXT: 
buffer_store_short v14, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v13, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v12, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v5, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v4, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v3, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v11, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v2, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v10, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v9, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -4633,6 +4640,7 @@ define void @void_func_bf16(bfloat %arg0) #0 { ; CI-LABEL: void_func_bf16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 @@ -4664,7 +4672,9 @@ define void @void_func_v2bf16(<2 x bfloat> %arg0) #0 { ; CI-LABEL: void_func_v2bf16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 @@ -4696,9 +4706,12 @@ define void @void_func_v3bf16(<3 x bfloat> %arg0) #0 { ; CI-LABEL: void_func_v3bf16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; 
CI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 @@ -4733,8 +4746,12 @@ define void @void_func_v4bf16(<4 x bfloat> %arg0) #0 { ; CI-LABEL: void_func_v4bf16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; CI-NEXT: v_alignbit_b32 v1, v1, v0, 16 ; CI-NEXT: s_mov_b32 s7, 0xf000 @@ -4767,10 +4784,18 @@ define void @void_func_v8bf16(<8 x bfloat> %arg0) #0 { ; CI-LABEL: void_func_v8bf16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_alignbit_b32 v6, v7, v6, 16 ; CI-NEXT: v_alignbit_b32 v5, v5, v4, 16 ; CI-NEXT: v_alignbit_b32 v4, v3, v2, 16 @@ -4805,21 +4830,37 @@ define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 { ; CI-LABEL: void_func_v16bf16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_alignbit_b32 v5, v5, v4, 16 ; CI-NEXT: v_alignbit_b32 v4, v3, v2, 16 ; CI-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; CI-NEXT: v_alignbit_b32 v14, v0, v14, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; CI-NEXT: v_alignbit_b32 v13, v0, v12, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; CI-NEXT: v_alignbit_b32 v12, v0, v10, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; CI-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; CI-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; CI-NEXT: v_alignbit_b32 v12, v0, v1, 16 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; CI-NEXT: v_alignbit_b32 v11, v0, v8, 16 +; CI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; CI-NEXT: v_alignbit_b32 v11, v0, v1, 16 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_alignbit_b32 v6, v7, v6, 16 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index 490167ee3635a..b88aa1ce33fb3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1504,26 +1504,33 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_bf16_agent: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX900-NEXT: 
s_mov_b64 s[0:1], 0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_and_b32 s2, s4, -4 -; GFX900-NEXT: s_mov_b32 s3, s5 -; GFX900-NEXT: s_load_dword s6, s[2:3], 0x0 -; GFX900-NEXT: s_and_b32 s4, s4, 3 -; GFX900-NEXT: s_lshl_b32 s4, s4, 3 -; GFX900-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX900-NEXT: s_not_b32 s5, s5 +; GFX900-NEXT: s_and_b32 s2, s6, -4 +; GFX900-NEXT: s_mov_b32 s3, s7 +; GFX900-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX900-NEXT: s_and_b32 s5, s6, 3 +; GFX900-NEXT: s_lshl_b32 s5, s5, 3 +; GFX900-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX900-NEXT: s_not_b32 s6, s6 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s6 +; GFX900-NEXT: v_mov_b32_e32 v1, s7 ; GFX900-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_and_or_b32 v1, v2, s5, v1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol @@ -1533,32 +1540,39 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX900-NEXT: 
s_cbranch_execnz .LBB10_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX900-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX900-NEXT: global_store_short v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_bf16_agent: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[0:1], 0 +; GFX908-NEXT: s_movk_i32 s4, 0x7fff ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_and_b32 s2, s4, -4 -; GFX908-NEXT: s_mov_b32 s3, s5 -; GFX908-NEXT: s_load_dword s6, s[2:3], 0x0 -; GFX908-NEXT: s_and_b32 s4, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, s4, 3 -; GFX908-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX908-NEXT: s_not_b32 s5, s5 +; GFX908-NEXT: s_and_b32 s2, s6, -4 +; GFX908-NEXT: s_mov_b32 s3, s7 +; GFX908-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX908-NEXT: s_and_b32 s5, s6, 3 +; GFX908-NEXT: s_lshl_b32 s5, s5, 3 +; GFX908-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX908-NEXT: s_not_b32 s6, s6 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s7 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s5, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, 
v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol @@ -1568,32 +1582,39 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX908-NEXT: global_store_short v[0:1], v0, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_bf16_agent: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_movk_i32 s4, 0x7fff ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s2, s4, -4 -; GFX90A-NEXT: s_mov_b32 s3, s5 -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x0 -; GFX90A-NEXT: s_and_b32 s4, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX90A-NEXT: s_not_b32 s5, s5 +; GFX90A-NEXT: s_and_b32 s2, s6, -4 +; GFX90A-NEXT: s_mov_b32 s3, s7 +; GFX90A-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX90A-NEXT: s_and_b32 s5, s6, 3 +; GFX90A-NEXT: s_lshl_b32 s5, s5, 3 +; GFX90A-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX90A-NEXT: s_not_b32 s6, s6 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s7 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s5, v1 +; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v1 ; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1603,7 +1624,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX90A-NEXT: global_store_short v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; @@ -1611,6 +1632,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_brev_b32 s5, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s2, -4 ; GFX10-NEXT: s_mov_b32 s1, s3 @@ -1627,6 +1649,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: 
v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v1, v2, s4, v1 ; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc @@ -1646,6 +1673,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-NEXT: s_brev_b32 s5, 1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s0, s2, -4 @@ -1658,12 +1686,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, s2, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1 ; GFX11-NEXT: v_and_or_b32 v1, v2, s4, v1 @@ -1690,26 +1724,33 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_bf16_system: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[0:1], 0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX900-NEXT: s_and_b32 s2, s4, -4 -; GFX900-NEXT: s_mov_b32 s3, s5 -; GFX900-NEXT: s_load_dword s6, s[2:3], 0x0 -; GFX900-NEXT: s_and_b32 s4, s4, 3 -; GFX900-NEXT: s_lshl_b32 s4, s4, 3 -; GFX900-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX900-NEXT: s_not_b32 s5, s5 +; GFX900-NEXT: s_and_b32 s2, s6, -4 +; GFX900-NEXT: s_mov_b32 s3, s7 +; GFX900-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX900-NEXT: s_and_b32 s5, s6, 3 +; GFX900-NEXT: s_lshl_b32 s5, s5, 3 +; GFX900-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX900-NEXT: s_not_b32 s6, s6 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s6 +; GFX900-NEXT: v_mov_b32_e32 v1, s7 ; GFX900-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX900-NEXT: v_and_or_b32 v1, v2, s5, v1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol @@ -1719,32 +1760,39 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX900-NEXT: s_cbranch_execnz .LBB11_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[0:1] -; 
GFX900-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX900-NEXT: global_store_short v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_bf16_system: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[0:1], 0 +; GFX908-NEXT: s_movk_i32 s4, 0x7fff ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_and_b32 s2, s4, -4 -; GFX908-NEXT: s_mov_b32 s3, s5 -; GFX908-NEXT: s_load_dword s6, s[2:3], 0x0 -; GFX908-NEXT: s_and_b32 s4, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, s4, 3 -; GFX908-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX908-NEXT: s_not_b32 s5, s5 +; GFX908-NEXT: s_and_b32 s2, s6, -4 +; GFX908-NEXT: s_mov_b32 s3, s7 +; GFX908-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX908-NEXT: s_and_b32 s5, s6, 3 +; GFX908-NEXT: s_lshl_b32 s5, s5, 3 +; GFX908-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX908-NEXT: s_not_b32 s6, s6 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, s7 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s5, v1 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol @@ -1754,32 +1802,39 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX908-NEXT: global_store_short v[0:1], v0, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_bf16_system: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_movk_i32 s4, 0x7fff ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s2, s4, -4 -; GFX90A-NEXT: s_mov_b32 s3, s5 -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x0 -; GFX90A-NEXT: s_and_b32 s4, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX90A-NEXT: s_not_b32 s5, s5 +; GFX90A-NEXT: s_and_b32 s2, s6, -4 +; GFX90A-NEXT: s_mov_b32 s3, s7 +; GFX90A-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX90A-NEXT: s_and_b32 s5, s6, 3 +; GFX90A-NEXT: s_lshl_b32 s5, s5, 3 +; GFX90A-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX90A-NEXT: s_not_b32 s6, s6 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: v_mov_b32_e32 v1, s7 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 -; 
GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s5, v1 +; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v1 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1791,7 +1846,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX90A-NEXT: global_store_short v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; @@ -1799,6 +1854,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_brev_b32 s5, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s2, -4 ; GFX10-NEXT: s_mov_b32 s1, s3 @@ -1815,6 +1871,11 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, 
s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v1, v2, s4, v1 ; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc @@ -1834,6 +1895,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX11-LABEL: global_atomic_fadd_ret_bf16_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-NEXT: s_brev_b32 s5, 1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s0, s2, -4 @@ -1846,12 +1908,18 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, s2, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1 ; GFX11-NEXT: v_and_or_b32 v1, v2, s4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll index 78db126fb2dc4..ad788b8d55014 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll @@ -3,10 +3,10 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 
-stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s ; We only care about which physical registers the parameters are copied from; ; the function bodies are just some arbitrary uses. 
@@ -64,59 +64,113 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre ; GISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY12]], [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into `ptr poison`) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 - ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 - ; DAGISEL-GFX11-NEXT: 
[[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]] - ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 - ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], 
%subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 - ; DAGISEL-GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]] - ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) - ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF 
+ ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; 
DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit 
$exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY]], 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: 
[[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = add <4 x i32> %a, %b store <4 x i32> %c, ptr poison ret void @@ -183,81 +237,157 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_ptr(ptr inreg ; GISEL-GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY15]], [[COPY11]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (p5) into %ir.b5, addrspace 5) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = 
COPY $sgpr5 - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 - ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 - ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9 - ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1 - ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] - ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b) - ; DAGISEL-GFX11-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; 
DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1) - ; DAGISEL-GFX11-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; DAGISEL-GFX11-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3) - ; DAGISEL-GFX11-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; DAGISEL-GFX11-NEXT: SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 - ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 - ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9 - ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: 
[[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] - ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b) - ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1) - ; DAGISEL-GFX10-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; DAGISEL-GFX10-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3) - ; DAGISEL-GFX10-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; DAGISEL-GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5) - ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX11-WF32-NEXT: 
[[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, 
implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b) + ; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; DAGISEL-GFX11-WF32-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3) + ; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; DAGISEL-GFX11-WF32-NEXT: SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; DAGISEL-GFX11-WF64-NEXT: 
[[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b) + ; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; DAGISEL-GFX11-WF64-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3) + ; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; DAGISEL-GFX11-WF64-NEXT: SCRATCH_STORE_DWORD [[COPY19]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.b5, addrspace 5) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: 
amdgpu_cs_chain_preserve_cc_ptr + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: 
[[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b) + ; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; DAGISEL-GFX10-WF32-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec :: (store (s32) into %ir.b3, addrspace 3) + ; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; DAGISEL-GFX10-WF32-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_ptr + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $sgpr1, $vgpr8, $vgpr9, $sgpr2, $sgpr3, $vgpr10, $vgpr11, $sgpr4, $vgpr12, $sgpr5, $vgpr13 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; 
DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b) + ; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY17]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s64) into %ir.b1, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; DAGISEL-GFX10-WF64-NEXT: DS_WRITE_B32_gfx9 [[COPY2]], [[COPY18]], 0, 0, implicit $exec 
:: (store (s32) into %ir.b3, addrspace 3) + ; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; DAGISEL-GFX10-WF64-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY19]], [[COPY]], $sgpr48_sgpr49_sgpr50_sgpr51, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.b5, addrspace 5) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 store ptr %a, ptr %b store ptr addrspace(1) %a1, ptr addrspace(1) %b1 store ptr addrspace(3) %a3, ptr addrspace(3) %b3 @@ -346,119 +476,233 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr, ; GISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY22]], [[REG_SEQUENCE3]], 16, 0, implicit $exec :: (store (<4 x s32>) into `ptr addrspace(1) poison` + 16, addrspace 1) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_struct - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13 - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 - ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 - ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 - ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6 - ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = 
IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 - ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; DAGISEL-GFX11-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3 - ; DAGISEL-GFX11-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; DAGISEL-GFX11-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] - ; DAGISEL-GFX11-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1 - ; DAGISEL-GFX11-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]] - ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX11-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]] - ; DAGISEL-GFX11-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) - ; 
DAGISEL-GFX11-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]] - ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX11-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]] - ; DAGISEL-GFX11-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) - ; DAGISEL-GFX11-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]] - ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) - ; DAGISEL-GFX11-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]] - ; DAGISEL-GFX11-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] - ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_struct - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13 - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 - ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 - 
; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 - ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6 - ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 - ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] - ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3 - ; DAGISEL-GFX10-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] - ; DAGISEL-GFX10-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: 
[[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX10-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]] - ; DAGISEL-GFX10-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX10-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX10-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]] - ; DAGISEL-GFX10-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) - ; DAGISEL-GFX10-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) - ; DAGISEL-GFX10-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]] - ; DAGISEL-GFX10-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) - ; DAGISEL-GFX10-NEXT: 
S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_struct + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; DAGISEL-GFX11-WF32-NEXT: 
[[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; 
DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_struct + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + 
; DAGISEL-GFX11-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; 
DAGISEL-GFX11-WF64-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_struct + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, 
$sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, 
[[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) + ; 
DAGISEL-GFX10-WF32-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_struct + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr6 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-WF64-NEXT: 
[[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]] + ; DAGISEL-GFX10-WF64-NEXT: 
[[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %p = extractvalue {ptr, i32, <4 x i32>} %a, 0 %i = extractvalue {ptr, i32, <4 x i32>} %a, 1 %v = extractvalue {ptr, i32, <4 x i32>} %a, 2 @@ -497,29 +741,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_float(float in ; GISEL-GFX10-NEXT: FLAT_STORE_DWORD [[COPY3]], [[V_ADD_F32_e64_]], 0, 0, implicit $exec, 
implicit $flat_scr :: (store (s32) into `ptr poison`) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_float - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_float - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) - ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_float + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY 
$sgpr0 + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_float + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_float + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; 
DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_float + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORD killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = fadd float %a, %b store float %c, ptr poison ret void @@ -552,29 +820,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre ; GISEL-GFX10-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_half - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; 
DAGISEL-GFX11-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_half - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_half + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_half + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: 
liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_half + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_half + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, 
[[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = fadd half %a, %b store half %c, ptr poison ret void @@ -607,33 +899,93 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; GISEL-GFX10-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec - ; DAGISEL-GFX11-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc - ; DAGISEL-GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec - ; DAGISEL-GFX10-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc - ; DAGISEL-GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 + ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; DAGISEL-GFX11-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_cc_bfloat + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + 
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 + ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; DAGISEL-GFX11-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF32-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; 
DAGISEL-GFX10-WF32-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 + ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; DAGISEL-GFX10-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_cc_bfloat + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF64-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc 
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 + ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; DAGISEL-GFX10-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = fadd bfloat %a, %b store bfloat %c, ptr poison ret void @@ -666,29 +1018,53 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg ; GISEL-GFX10-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: 
amdgpu_cs_chain_preserve_cc_i16 - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_i16 - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16 + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; 
DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16 + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16 + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; 
DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16 + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = add i16 %a, %b store i16 %c, ptr poison ret void @@ -787,101 +1163,197 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i ; GISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY27]], [[COPY25]], 16, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>) into `ptr poison` + 16, basealign 32) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14 - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13 - ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12 - ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 - ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 - ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; 
DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7 - ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6 - ; DAGISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; DAGISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; DAGISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], 
%subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 - ; DAGISEL-GFX11-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]] - ; DAGISEL-GFX11-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) - ; DAGISEL-GFX11-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 - ; DAGISEL-GFX11-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]] - ; DAGISEL-GFX11-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] - ; DAGISEL-GFX11-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14 - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13 - ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12 - ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 - ; DAGISEL-GFX10-NEXT: 
[[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 - ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7 - ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6 - ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5 - ; DAGISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4 - ; DAGISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = 
REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 - ; DAGISEL-GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]] - ; DAGISEL-GFX10-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) - ; DAGISEL-GFX10-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 - ; DAGISEL-GFX10-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]] - ; DAGISEL-GFX10-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] - ; DAGISEL-GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) - ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; 
DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF 
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) + ; DAGISEL-GFX11-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, 
$vgpr14, $vgpr15 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; 
DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) + ; DAGISEL-GFX11-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into 
`ptr poison`, align 32) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, 
[[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) + ; DAGISEL-GFX10-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 + ; DAGISEL-GFX10-WF32-NEXT: 
[[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] + ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_v16i16 + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr15 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr7 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr6 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY15:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY12]], 8, [[COPY4]], 0, 0, 0, 0, 0, implicit $exec + ; 
DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_1:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY13]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_2:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY14]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_3:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY15]], 8, [[COPY7]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_4:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY8]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16) + ; DAGISEL-GFX10-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: 
[[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]] + ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = add <16 x i16> %a, %b store <16 x i16> %c, ptr poison ret void @@ -922,45 +1394,85 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i ; GISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (<2 x s32>) into `ptr addrspace(1) poison`, addrspace 1) ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_many_regs - ; DAGISEL-GFX11: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135 - ; DAGISEL-GFX11-NEXT: {{ $}} - ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135 - ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35 - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; DAGISEL-GFX11-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]] - ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX11-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; DAGISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_many_regs - ; DAGISEL-GFX10: bb.0 (%ir-block.0): - ; DAGISEL-GFX10-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135 - ; DAGISEL-GFX10-NEXT: {{ $}} - ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135 - ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35 - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]] - ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; DAGISEL-GFX10-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_many_regs + ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): + ; 
DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135 + ; DAGISEL-GFX11-WF32-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]] + ; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_many_regs + ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135 + ; DAGISEL-GFX11-WF64-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; 
DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]] + ; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_many_regs + ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF32-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135 + ; DAGISEL-GFX10-WF32-NEXT: {{ $}} + ; DAGISEL-GFX10-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF32-NEXT: 
[[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]] + ; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX10-WF64-LABEL: name: amdgpu_cs_chain_preserve_many_regs + ; DAGISEL-GFX10-WF64: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-WF64-NEXT: liveins: $sgpr35, $vgpr8, $vgpr135 + ; DAGISEL-GFX10-WF64-NEXT: {{ $}} + ; DAGISEL-GFX10-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr135 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr35 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]] + ; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0 %c = extractelement <36 x i32> %a, i32 35 store i32 %c, ptr addrspace(1) poison diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 
04bf2120b78cf..ea823f30f26c2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -161,6 +161,7 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: snan_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -221,6 +222,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: qnan_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 @@ -268,6 +270,7 @@ define i1 @posinf_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: posinf_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 @@ -311,6 +314,7 @@ define i1 @neginf_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: neginf_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: s_mov_b32 s4, 0xff80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 @@ -354,6 +358,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: posnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 @@ -418,6 +423,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; 
GFX7CHECK-LABEL: negnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 @@ -482,6 +488,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: possubnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -531,6 +538,7 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: negsubnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e64 v0, s[4:5], -1, v0 @@ -594,6 +602,7 @@ define i1 @poszero_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: poszero_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -634,6 +643,7 @@ define i1 @negzero_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: negzero_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: s_mov_b32 s4, 0x8000 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 @@ -677,6 +687,7 @@ define i1 @posfinite_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: posfinite_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 
1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 @@ -720,6 +731,7 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: negfinite_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -778,6 +790,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: isnan_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 @@ -825,6 +838,7 @@ define i1 @not_isnan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_isnan_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -872,6 +886,8 @@ define <2 x i1> @isnan_v2bf16(<2 x bfloat> %x) nounwind { ; GFX7CHECK-LABEL: isnan_v2bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7CHECK-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -933,8 +949,11 @@ define <3 x i1> @isnan_v3bf16(<3 x bfloat> %x) nounwind { ; GFX7CHECK-LABEL: isnan_v3bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7CHECK-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 ; GFX7CHECK-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v2, v2, 16, 15 @@ -1009,10 +1028,14 @@ define <4 x i1> @isnan_v4bf16(<4 x bfloat> %x) nounwind { ; GFX7CHECK-LABEL: isnan_v4bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7CHECK-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7CHECK-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7CHECK-NEXT: v_bfe_u32 v2, v2, 16, 15 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v1 @@ -1104,6 +1127,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: isinf_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 @@ -1151,6 +1175,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: isfinite_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -1198,6 +1223,7 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) { ; GFX7CHECK-LABEL: issubnormal_or_zero_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -1244,6 +1270,7 @@ define i1 
@not_issubnormal_or_zero_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_issubnormal_or_zero_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -1290,6 +1317,7 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: isnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1343,6 +1371,7 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_isnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1396,6 +1425,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_is_plus_normal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 @@ -1460,6 +1490,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_is_neg_normal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 @@ -1524,6 +1555,7 @@ define i1 @issubnormal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: issubnormal_bf16: ; GFX7CHECK: ; %bb.0: ; 
GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f @@ -1576,6 +1608,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_issubnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7e @@ -1628,6 +1661,7 @@ define i1 @iszero_bf16(bfloat %x) { ; GFX7CHECK-LABEL: iszero_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1672,6 +1706,7 @@ define i1 @not_iszero_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_iszero_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1716,6 +1751,7 @@ define i1 @ispositive_bf16(bfloat %x) { ; GFX7CHECK-LABEL: ispositive_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 @@ -1759,6 +1795,7 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_ispositive_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; 
GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -1841,6 +1878,7 @@ define i1 @isnegative_bf16(bfloat %x) { ; GFX7CHECK-LABEL: isnegative_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v2, 16, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 @@ -1913,6 +1951,7 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_isnegative_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -1974,6 +2013,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: iszero_or_nan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 @@ -2032,6 +2072,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX7CHECK-LABEL: iszero_or_nan_f_daz: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 @@ -2090,6 +2131,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX7CHECK-LABEL: iszero_or_nan_f_maybe_daz: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 @@ -2148,6 +2190,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_iszero_or_nan_bf16: 
; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -2206,6 +2249,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX7CHECK-LABEL: not_iszero_or_nan_f_daz: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -2264,6 +2308,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX7CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -2322,6 +2367,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: iszero_or_qnan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 @@ -2380,6 +2426,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: iszero_or_snan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -2451,6 +2498,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_iszero_or_qnan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX7CHECK-NEXT: s_movk_i32 s8, 0x7f80 @@ -2559,6 +2607,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_iszero_or_snan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 @@ -2657,6 +2706,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: isinf_or_nan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f7f ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 @@ -2705,6 +2755,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_isinf_or_nan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -2753,6 +2804,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) { ; GFX7CHECK-LABEL: isfinite_or_nan_f: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 @@ -2801,6 +2853,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) { ; GFX7CHECK-LABEL: not_isfinite_or_nan_f: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 
s4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 3be4665cf3a00..e906b5327c362 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1411,6 +1411,13 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: v_and_b32_e32 v5, v4, v2 ; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v3, v5, v3 @@ -1429,30 +1436,37 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX9-NEXT: ds_read_b32 v2, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: ds_read_b32 v3, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s4 -; GFX9-NEXT: v_not_b32_e32 v3, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_f32_e32 
v2, 4.0, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v1, v4, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX9-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_ret_bf16: @@ -1544,6 +1558,13 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; VI-NEXT: v_bfe_u32 v6, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; VI-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; VI-NEXT: v_and_b32_e32 v5, v3, v2 ; VI-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 @@ -1569,11 +1590,18 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4 ; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX9-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index d76bb48b4a82a..590b40960faab 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -4237,57 +4237,107 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2bf16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; 
GFX9-NEXT: s_mov_b32 s3, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] -; GFX9-NEXT: s_mov_b32 s0, 0x7060302 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7 -; GFX9-NEXT: v_fma_f32 v0, v8, v4, v0 +; GFX9-NEXT: v_fma_f32 v1, v8, v5, v1 ; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11 -; GFX9-NEXT: v_fma_f32 v1, v12, v4, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2 +; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v7 +; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0x80000000, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0x80000000, v8 +; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2 +; 
GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GFX9-NEXT: v_fma_f32 v0, v2, v10, v0 -; GFX9-NEXT: v_fma_f32 v2, v2, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1 -; GFX9-NEXT: v_fma_f32 v3, v3, v5, v7 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s0 -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5 +; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2 +; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v1 +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v2 +; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2 +; GFX9-NEXT: v_or_b32_e32 
v8, 0x400000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0x80000000, v4 +; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add3_u32 v11, v11, v4, s2 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc +; GFX9-NEXT: v_perm_b32 v2, v4, v2, s3 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s3 +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fma_shuffle_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX10-NEXT: s_brev_b32 s2, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[10:11] ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4296,73 +4346,162 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, 
v1 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v9 ; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_fmac_f32_e32 v11, v10, v4 -; GFX10-NEXT: v_fmac_f32_e32 v1, v10, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9 +; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4 +; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX10-NEXT: v_and_or_b32 v8, v7, s2, 0x400000 +; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_and_or_b32 v12, v0, s2, 0x400000 +; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff +; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX10-NEXT: v_and_or_b32 v16, v1, s2, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX10-NEXT: v_and_or_b32 v14, v11, s2, 0x400000 +; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_fmac_f32_e32 v4, v2, v5 +; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo +; GFX10-NEXT: v_and_or_b32 v8, v4, s2, 0x400000 +; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10 ; GFX10-NEXT: v_and_b32_e32 
v7, 0xffff0000, v7 -; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v12 -; GFX10-NEXT: v_fmac_f32_e32 v4, v3, v12 -; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v5 -; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v5 -; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5 +; GFX10-NEXT: v_and_or_b32 v3, v0, s2, 0x400000 +; GFX10-NEXT: v_and_or_b32 v10, v1, s2, 0x400000 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_and_or_b32 v12, v7, s2, 0x400000 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fma_shuffle_v2bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX11-NEXT: s_brev_b32 s0, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5] -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[0:1] -; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(2) -; 
GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[4:5] +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v1, v10, v9 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v1, v3, v5 :: v_dual_lshlrev_b32 v4, 16, v4 -; GFX11-NEXT: v_dual_fmac_f32 v11, v10, v4 :: v_dual_lshlrev_b32 v8, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_fmac_f32 v11, v12, v9 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_fmac_f32_e32 v1, v12, v4 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-NEXT: v_and_or_b32 v14, v11, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 +; GFX11-NEXT: v_and_or_b32 v16, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 +; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-NEXT: v_and_or_b32 v8, v7, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff +; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fmac_f32_e32 v1, v3, v10 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_dual_fmac_f32 v4, 
v3, v12 :: v_dual_fmac_f32 v7, v2, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v12 -; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v5 +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-NEXT: v_and_or_b32 v12, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10 +; GFX11-NEXT: v_and_or_b32 v10, v1, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index a9faa130d6379..884860712632d 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK,SM80 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} From 0a518db99e0cffcdbb4cae73e27da87edbb25170 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Wed, 21 Feb 2024 09:39:31 -0800 Subject: [PATCH 127/351] [InstallAPI] Set InstallAPI as a standalone tool instead of CC1 action (#82293) Installapi has important distinctions when compared to the clang driver, so much that, it doesn't make much sense to try to integrate into it. This patch partially reverts the CC1 action & driver support to replace with its own driver as a clang tool. For distribution, we could use `LLVM_TOOL_LLVM_DRIVER_BUILD` mechanism for integrating the functionality into clang such that the toolchain size is less impacted. 
--- .../clang/Basic/DiagnosticDriverKinds.td | 3 - clang/include/clang/Driver/Action.h | 12 -- clang/include/clang/Driver/Options.td | 12 +- clang/include/clang/Driver/Types.def | 1 - .../include/clang/Frontend/CompilerInstance.h | 7 - .../clang/Frontend/CompilerInvocation.h | 9 +- .../include/clang/Frontend/FrontendActions.h | 10 -- .../include/clang/Frontend/FrontendOptions.h | 3 - .../clang/Frontend/InstallAPIOptions.h | 28 ---- clang/include/clang/InstallAPI/Context.h | 25 ---- clang/lib/Driver/Action.cpp | 7 - clang/lib/Driver/Driver.cpp | 16 +-- clang/lib/Driver/ToolChain.cpp | 1 - clang/lib/Driver/ToolChains/Clang.cpp | 11 -- clang/lib/Frontend/CMakeLists.txt | 3 - clang/lib/Frontend/CompilerInvocation.cpp | 41 +----- clang/lib/Frontend/InstallAPIConsumer.cpp | 43 ------ .../ExecuteCompilerInvocation.cpp | 2 - clang/lib/InstallAPI/CMakeLists.txt | 1 - clang/lib/InstallAPI/Context.cpp | 27 ---- clang/test/CMakeLists.txt | 1 + clang/test/Driver/installapi.h | 13 -- clang/test/InstallAPI/installapi-basic.test | 39 +++++- .../installapi-driver-invalid-options.test | 4 + clang/test/lit.cfg.py | 1 + clang/tools/CMakeLists.txt | 1 + clang/tools/clang-installapi/CMakeLists.txt | 20 +++ .../clang-installapi/ClangInstallAPI.cpp | 121 ++++++++++++++++ clang/tools/clang-installapi/Options.cpp | 129 ++++++++++++++++++ clang/tools/clang-installapi/Options.h | 88 ++++++++++++ 30 files changed, 408 insertions(+), 271 deletions(-) delete mode 100644 clang/include/clang/Frontend/InstallAPIOptions.h delete mode 100644 clang/lib/Frontend/InstallAPIConsumer.cpp delete mode 100644 clang/lib/InstallAPI/Context.cpp delete mode 100644 clang/test/Driver/installapi.h create mode 100644 clang/test/InstallAPI/installapi-driver-invalid-options.test create mode 100644 clang/tools/clang-installapi/CMakeLists.txt create mode 100644 clang/tools/clang-installapi/ClangInstallAPI.cpp create mode 100644 clang/tools/clang-installapi/Options.cpp create mode 100644 
clang/tools/clang-installapi/Options.h diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 0807d8877591a..b13181f6e7089 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -804,7 +804,4 @@ def warn_android_unversioned_fallback : Warning< def err_drv_triple_version_invalid : Error< "version '%0' in target triple '%1' is invalid">; - -def err_drv_installapi_unsupported : Error< - "InstallAPI is not supported for '%0'">; } diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index 2768e2f5df1a9..04fa8b01b418f 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -59,7 +59,6 @@ class Action { PreprocessJobClass, PrecompileJobClass, ExtractAPIJobClass, - InstallAPIJobClass, AnalyzeJobClass, MigrateJobClass, CompileJobClass, @@ -449,17 +448,6 @@ class ExtractAPIJobAction : public JobAction { void addHeaderInput(Action *Input) { getInputs().push_back(Input); } }; -class InstallAPIJobAction : public JobAction { - void anchor() override; - -public: - InstallAPIJobAction(Action *Input, types::ID OutputType); - - static bool classof(const Action *A) { - return A->getKind() == InstallAPIJobClass; - } -}; - class AnalyzeJobAction : public JobAction { void anchor() override; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 36a42b1b050c2..3a028fadb25b1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -336,8 +336,6 @@ class AnalyzerOpts : KeyPathAndMacro<"AnalyzerOpts->", base, "ANALYZER_"> {} class MigratorOpts : KeyPathAndMacro<"MigratorOpts.", base, "MIGRATOR_"> {} -class InstallAPIOpts - : KeyPathAndMacro<"InstallAPIOpts.", base, "INSTALLAPI_"> {} // A boolean option which is opt-in in CC1. 
The positive option exists in CC1 and // Args.hasArg(OPT_ffoo) can be used to check that the flag is enabled. @@ -1143,8 +1141,7 @@ def config_user_dir_EQ : Joined<["--"], "config-user-dir=">, def coverage : Flag<["-", "--"], "coverage">, Group, Visibility<[ClangOption, CLOption]>; def cpp_precomp : Flag<["-"], "cpp-precomp">, Group; -def current__version : JoinedOrSeparate<["-"], "current_version">, - Visibility<[ClangOption, CC1Option]>; +def current__version : JoinedOrSeparate<["-"], "current_version">; def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group, HelpText<"Add directory to the C++ SYSTEM include search path">, Visibility<[ClangOption, CC1Option]>, @@ -1559,9 +1556,6 @@ def static_libsan : Flag<["-"], "static-libsan">, HelpText<"Statically link the sanitizer runtime (Not supported for ASan, TSan or UBSan on darwin)">; def : Flag<["-"], "shared-libasan">, Alias; def fasm : Flag<["-"], "fasm">, Group; -def installapi : Flag<["-"], "installapi">, - Visibility<[ClangOption, CC1Option]>, Group, - HelpText<"Create a text-based stub file by scanning header files">; defm assume_unique_vtables : BoolFOption<"assume-unique-vtables", CodeGenOpts<"AssumeUniqueVTables">, DefaultTrue, @@ -4320,9 +4314,7 @@ def verify_pch : Flag<["-"], "verify-pch">, Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Load and verify that a pre-compiled header file is not stale">; def init : Separate<["-"], "init">; -def install__name : Separate<["-"], "install_name">, - Visibility<[ClangOption, CC1Option]>, - MarshallingInfoString>; +def install__name : Separate<["-"], "install_name">; def iprefix : JoinedOrSeparate<["-"], "iprefix">, Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Set the -iwithprefix/-iwithprefixbefore prefix">, MetaVarName<"">; diff --git a/clang/include/clang/Driver/Types.def b/clang/include/clang/Driver/Types.def index 570a53441d1c7..f72c27e1ee701 100644 --- a/clang/include/clang/Driver/Types.def +++ b/clang/include/clang/Driver/Types.def @@ 
-94,7 +94,6 @@ TYPE("lto-bc", LTO_BC, INVALID, "o", phases TYPE("ast", AST, INVALID, "ast", phases::Compile, phases::Backend, phases::Assemble, phases::Link) TYPE("ifs", IFS, INVALID, "ifs", phases::IfsMerge) TYPE("ifs-cpp", IFS_CPP, INVALID, "ifs", phases::Compile, phases::IfsMerge) -TYPE("tbd", TextAPI, INVALID, "tbd", phases::Precompile) TYPE("pcm", ModuleFile, INVALID, "pcm", phases::Compile, phases::Backend, phases::Assemble, phases::Link) TYPE("header-unit", HeaderUnit, INVALID, "pcm", phases::Compile, phases::Backend, phases::Assemble, phases::Link) TYPE("plist", Plist, INVALID, "plist", phases::Compile, phases::Backend, phases::Assemble, phases::Link) diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index 6eb7972f86ca5..ac2f940769fbe 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -294,13 +294,6 @@ class CompilerInstance : public ModuleLoader { return Invocation->getFrontendOpts(); } - InstallAPIOptions &getInstallAPIOpts() { - return Invocation->getInstallAPIOpts(); - } - const InstallAPIOptions &getInstallAPIOpts() const { - return Invocation->getInstallAPIOpts(); - } - HeaderSearchOptions &getHeaderSearchOpts() { return Invocation->getHeaderSearchOpts(); } diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h index a01d9695dce20..c6528779bde7b 100644 --- a/clang/include/clang/Frontend/CompilerInvocation.h +++ b/clang/include/clang/Frontend/CompilerInvocation.h @@ -18,12 +18,11 @@ #include "clang/Basic/LangStandard.h" #include "clang/Frontend/DependencyOutputOptions.h" #include "clang/Frontend/FrontendOptions.h" -#include "clang/Frontend/InstallAPIOptions.h" #include "clang/Frontend/MigratorOptions.h" #include "clang/Frontend/PreprocessorOutputOptions.h" #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h" -#include "llvm/ADT/ArrayRef.h" #include 
"llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/ADT/ArrayRef.h" #include #include @@ -112,9 +111,6 @@ class CompilerInvocationBase { /// Options controlling preprocessed output. std::shared_ptr PreprocessorOutputOpts; - /// Options controlling InstallAPI operations and output. - std::shared_ptr InstallAPIOpts; - /// Dummy tag type whose instance can be passed into the constructor to /// prevent creation of the reference-counted option objects. struct EmptyConstructor {}; @@ -149,7 +145,6 @@ class CompilerInvocationBase { const PreprocessorOutputOptions &getPreprocessorOutputOpts() const { return *PreprocessorOutputOpts; } - const InstallAPIOptions &getInstallAPIOpts() const { return *InstallAPIOpts; } /// @} /// Command line generation. @@ -242,7 +237,6 @@ class CompilerInvocation : public CompilerInvocationBase { using CompilerInvocationBase::getFrontendOpts; using CompilerInvocationBase::getDependencyOutputOpts; using CompilerInvocationBase::getPreprocessorOutputOpts; - using CompilerInvocationBase::getInstallAPIOpts; /// @} /// Mutable getters. @@ -264,7 +258,6 @@ class CompilerInvocation : public CompilerInvocationBase { PreprocessorOutputOptions &getPreprocessorOutputOpts() { return *PreprocessorOutputOpts; } - InstallAPIOptions &getInstallAPIOpts() { return *InstallAPIOpts; } /// @} /// Base class internals. 
diff --git a/clang/include/clang/Frontend/FrontendActions.h b/clang/include/clang/Frontend/FrontendActions.h index b8229252f5ed2..fcce31ac0590f 100644 --- a/clang/include/clang/Frontend/FrontendActions.h +++ b/clang/include/clang/Frontend/FrontendActions.h @@ -130,16 +130,6 @@ class GenerateModuleAction : public ASTFrontendAction { bool shouldEraseOutputFiles() override; }; -class InstallAPIAction : public ASTFrontendAction { -protected: - std::unique_ptr CreateASTConsumer(CompilerInstance &CI, - StringRef InFile) override; - -public: - static std::unique_ptr - CreateOutputFile(CompilerInstance &CI, StringRef InFile); -}; - class GenerateInterfaceStubsAction : public ASTFrontendAction { protected: std::unique_ptr CreateASTConsumer(CompilerInstance &CI, diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h index 62d16ba542ea4..53a8681cfdbba 100644 --- a/clang/include/clang/Frontend/FrontendOptions.h +++ b/clang/include/clang/Frontend/FrontendOptions.h @@ -100,9 +100,6 @@ enum ActionKind { /// Only execute frontend initialization. InitOnly, - // Create TextAPI stub. - InstallAPI, - /// Dump information about a module file. ModuleFileInfo, diff --git a/clang/include/clang/Frontend/InstallAPIOptions.h b/clang/include/clang/Frontend/InstallAPIOptions.h deleted file mode 100644 index cf65a3350c6de..0000000000000 --- a/clang/include/clang/Frontend/InstallAPIOptions.h +++ /dev/null @@ -1,28 +0,0 @@ -//===--- InstallAPIOptions.h ------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CLANG_FRONTEND_INSTALLAPIOPTIONS_H -#define LLVM_CLANG_FRONTEND_INSTALLAPIOPTIONS_H - -#include "llvm/TextAPI/PackedVersion.h" - -namespace clang { - -/// InstallAPIOptions - Options for controlling InstallAPI verification and -/// TextAPI output. -class InstallAPIOptions { -public: - /// The install name which is apart of the library's ID. - std::string InstallName; - - /// The current version which is apart of the library's ID. - llvm::MachO::PackedVersion CurrentVersion; -}; -} // namespace clang - -#endif diff --git a/clang/include/clang/InstallAPI/Context.h b/clang/include/clang/InstallAPI/Context.h index a1ff7c12a2f83..b06168918a613 100644 --- a/clang/include/clang/InstallAPI/Context.h +++ b/clang/include/clang/InstallAPI/Context.h @@ -5,18 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// Top level types for interacting with the generic clang driver and frontend -// for InstallAPI operations. -// -//===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_INSTALLAPI_CONTEXT_H #define LLVM_CLANG_INSTALLAPI_CONTEXT_H -#include "clang/AST/ASTConsumer.h" -#include "clang/Basic/Diagnostic.h" -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/TextAPI/InterfaceFile.h" #include "llvm/TextAPI/RecordVisitor.h" #include "llvm/TextAPI/RecordsSlice.h" @@ -35,12 +27,6 @@ struct InstallAPIContext { /// Active target triple to parse. llvm::Triple TargetTriple{}; - /// Output stream to write TextAPI file to. - std::unique_ptr OS = nullptr; - - /// DiagnosticsEngine to report errors. - llvm::IntrusiveRefCntPtr Diags = nullptr; - /// File Path of output location. 
StringRef OutputLoc{}; @@ -48,17 +34,6 @@ struct InstallAPIContext { llvm::MachO::FileType FT = llvm::MachO::FileType::TBD_V5; }; -class InstallAPIConsumer : public ASTConsumer { -public: - InstallAPIConsumer(InstallAPIContext InstallAPICtx) - : Ctx(std::move(InstallAPICtx)) {} - - void HandleTranslationUnit(ASTContext &ASTContext) override; - -private: - InstallAPIContext Ctx; -}; - } // namespace installapi } // namespace clang diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp index 7b1a1bb0228c4..849bf6035ebd2 100644 --- a/clang/lib/Driver/Action.cpp +++ b/clang/lib/Driver/Action.cpp @@ -32,8 +32,6 @@ const char *Action::getClassName(ActionClass AC) { case CompileJobClass: return "compiler"; case BackendJobClass: return "backend"; case AssembleJobClass: return "assembler"; - case InstallAPIJobClass: - return "installapi"; case IfsMergeJobClass: return "interface-stub-merger"; case LinkJobClass: return "linker"; case LipoJobClass: return "lipo"; @@ -364,11 +362,6 @@ void ExtractAPIJobAction::anchor() {} ExtractAPIJobAction::ExtractAPIJobAction(Action *Inputs, types::ID OutputType) : JobAction(ExtractAPIJobClass, Inputs, OutputType) {} -void InstallAPIJobAction::anchor() {} - -InstallAPIJobAction::InstallAPIJobAction(Action *Inputs, types::ID OutputType) - : JobAction(InstallAPIJobClass, Inputs, OutputType) {} - void AnalyzeJobAction::anchor() {} AnalyzeJobAction::AnalyzeJobAction(Action *Input, types::ID OutputType) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 5a323bf4c0c5f..00e14071a4afe 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4189,11 +4189,6 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, break; } - if (isa(Current)) { - Current = nullptr; - break; - } - // FIXME: Should we include any prior module file outputs as inputs of // later actions in the same command line? 
@@ -4324,13 +4319,6 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, if (!MergerInputs.empty()) Actions.push_back( C.MakeAction(MergerInputs, types::TY_Image)); - } else if (Args.hasArg(options::OPT_installapi)) { - // TODO: Lift restriction once operation can handle multiple inputs. - assert(Inputs.size() == 1 && "InstallAPI action can only handle 1 input"); - const auto [InputType, InputArg] = Inputs.front(); - Action *Current = C.MakeAction(*InputArg, InputType); - Actions.push_back( - C.MakeAction(Current, types::TY_TextAPI)); } for (auto Opt : {options::OPT_print_supported_cpus, @@ -4774,8 +4762,6 @@ Action *Driver::ConstructPhaseAction( return C.MakeAction(Input, types::TY_Nothing); if (Args.hasArg(options::OPT_extract_api)) return C.MakeAction(Input, types::TY_API_INFO); - if (Args.hasArg(options::OPT_installapi)) - return C.MakeAction(Input, types::TY_TextAPI); return C.MakeAction(Input, types::TY_LLVM_BC); } case phases::Backend: { @@ -6455,7 +6441,7 @@ bool Driver::ShouldUseClangCompiler(const JobAction &JA) const { // And say "no" if this is not a kind of action clang understands. 
if (!isa(JA) && !isa(JA) && !isa(JA) && !isa(JA) && - !isa(JA) && !isa(JA)) + !isa(JA)) return false; return true; diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 657577cea6c7d..388030592b483 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -532,7 +532,6 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const { case Action::PrecompileJobClass: case Action::PreprocessJobClass: case Action::ExtractAPIJobClass: - case Action::InstallAPIJobClass: case Action::AnalyzeJobClass: case Action::MigrateJobClass: case Action::VerifyPCHJobClass: diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7daf945ae1271..7c0409f0c3097 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4939,17 +4939,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Arg *ExtractAPIIgnoresFileArg = Args.getLastArg(options::OPT_extract_api_ignores_EQ)) ExtractAPIIgnoresFileArg->render(Args, CmdArgs); - } else if (isa(JA)) { - if (!Triple.isOSDarwin()) - D.Diag(diag::err_drv_installapi_unsupported) << Triple.str(); - - CmdArgs.push_back("-installapi"); - // Add necessary library arguments for InstallAPI. 
- if (const Arg *A = Args.getLastArg(options::OPT_install__name)) - A->render(Args, CmdArgs); - if (const Arg *A = Args.getLastArg(options::OPT_current__version)) - A->render(Args, CmdArgs); - } else { assert((isa(JA) || isa(JA)) && "Invalid action for clang tool."); diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt index f443d88b5d30c..a916667208845 100644 --- a/clang/lib/Frontend/CMakeLists.txt +++ b/clang/lib/Frontend/CMakeLists.txt @@ -7,7 +7,6 @@ set(LLVM_LINK_COMPONENTS ProfileData Support TargetParser - TextAPI ) add_clang_library(clangFrontend @@ -28,7 +27,6 @@ add_clang_library(clangFrontend HeaderIncludeGen.cpp InitPreprocessor.cpp LayoutOverrideSource.cpp - InstallAPIConsumer.cpp LogDiagnosticPrinter.cpp ModuleDependencyCollector.cpp MultiplexConsumer.cpp @@ -55,7 +53,6 @@ add_clang_library(clangFrontend clangBasic clangDriver clangEdit - clangInstallAPI clangLex clangParse clangSema diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index bcb31243056b7..8d7b75b56d612 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -149,8 +149,7 @@ CompilerInvocationBase::CompilerInvocationBase() FSOpts(std::make_shared()), FrontendOpts(std::make_shared()), DependencyOutputOpts(std::make_shared()), - PreprocessorOutputOpts(std::make_shared()), - InstallAPIOpts(std::make_shared()) {} + PreprocessorOutputOpts(std::make_shared()) {} CompilerInvocationBase & CompilerInvocationBase::deep_copy_assign(const CompilerInvocationBase &X) { @@ -168,7 +167,6 @@ CompilerInvocationBase::deep_copy_assign(const CompilerInvocationBase &X) { FrontendOpts = make_shared_copy(X.getFrontendOpts()); DependencyOutputOpts = make_shared_copy(X.getDependencyOutputOpts()); PreprocessorOutputOpts = make_shared_copy(X.getPreprocessorOutputOpts()); - InstallAPIOpts = make_shared_copy(X.getInstallAPIOpts()); } return *this; } @@ -189,7 +187,6 @@ 
CompilerInvocationBase::shallow_copy_assign(const CompilerInvocationBase &X) { FrontendOpts = X.FrontendOpts; DependencyOutputOpts = X.DependencyOutputOpts; PreprocessorOutputOpts = X.PreprocessorOutputOpts; - InstallAPIOpts = X.InstallAPIOpts; } return *this; } @@ -2161,34 +2158,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, return Diags.getNumErrors() == NumErrorsBefore; } -static bool ParseInstallAPIArgs(InstallAPIOptions &Opts, ArgList &Args, - DiagnosticsEngine &Diags, - frontend::ActionKind Action) { - unsigned NumErrorsBefore = Diags.getNumErrors(); - - InstallAPIOptions &InstallAPIOpts = Opts; -#define INSTALLAPI_OPTION_WITH_MARSHALLING(...) \ - PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" -#undef INSTALLAPI_OPTION_WITH_MARSHALLING - if (Arg *A = Args.getLastArg(options::OPT_current__version)) - Opts.CurrentVersion.parse64(A->getValue()); - - return Diags.getNumErrors() == NumErrorsBefore; -} - -static void GenerateInstallAPIArgs(const InstallAPIOptions &Opts, - ArgumentConsumer Consumer) { - const InstallAPIOptions &InstallAPIOpts = Opts; -#define INSTALLAPI_OPTION_WITH_MARSHALLING(...) 
\ - GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" -#undef INSTALLAPI_OPTION_WITH_MARSHALLING - if (!Opts.CurrentVersion.empty()) - GenerateArg(Consumer, OPT_current__version, - std::string(Opts.CurrentVersion)); -} - static void GenerateDependencyOutputArgs(const DependencyOutputOptions &Opts, ArgumentConsumer Consumer) { const DependencyOutputOptions &DependencyOutputOpts = Opts; @@ -2588,7 +2557,6 @@ static const auto &getFrontendActionTable() { {frontend::GeneratePCH, OPT_emit_pch}, {frontend::GenerateInterfaceStubs, OPT_emit_interface_stubs}, {frontend::InitOnly, OPT_init_only}, - {frontend::InstallAPI, OPT_installapi}, {frontend::ParseSyntaxOnly, OPT_fsyntax_only}, {frontend::ModuleFileInfo, OPT_module_file_info}, {frontend::VerifyPCH, OPT_verify_pch}, @@ -4312,7 +4280,6 @@ static bool isStrictlyPreprocessorAction(frontend::ActionKind Action) { case frontend::GenerateHeaderUnit: case frontend::GeneratePCH: case frontend::GenerateInterfaceStubs: - case frontend::InstallAPI: case frontend::ParseSyntaxOnly: case frontend::ModuleFileInfo: case frontend::VerifyPCH: @@ -4687,11 +4654,6 @@ bool CompilerInvocation::CreateFromArgsImpl( Res.getDependencyOutputOpts().Targets.empty()) Diags.Report(diag::err_fe_dependency_file_requires_MT); - if (Args.hasArg(OPT_installapi)) { - ParseInstallAPIArgs(Res.getInstallAPIOpts(), Args, Diags, - Res.getFrontendOpts().ProgramAction); - } - // If sanitizer is enabled, disable OPT_ffine_grained_bitfield_accesses. 
if (Res.getCodeGenOpts().FineGrainedBitfieldAccesses && !Res.getLangOpts().Sanitize.empty()) { @@ -4882,7 +4844,6 @@ void CompilerInvocationBase::generateCC1CommandLine( GeneratePreprocessorOutputArgs(getPreprocessorOutputOpts(), Consumer, getFrontendOpts().ProgramAction); GenerateDependencyOutputArgs(getDependencyOutputOpts(), Consumer); - GenerateInstallAPIArgs(getInstallAPIOpts(), Consumer); } std::vector CompilerInvocationBase::getCC1CommandLine() const { diff --git a/clang/lib/Frontend/InstallAPIConsumer.cpp b/clang/lib/Frontend/InstallAPIConsumer.cpp deleted file mode 100644 index c0f22c1a589f3..0000000000000 --- a/clang/lib/Frontend/InstallAPIConsumer.cpp +++ /dev/null @@ -1,43 +0,0 @@ -//===--- InstallAPIConsumer.cpp -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang/Frontend/CompilerInstance.h" -#include "clang/Frontend/FrontendActions.h" -#include "clang/InstallAPI/Context.h" - -using namespace clang; -using namespace clang::installapi; - -std::unique_ptr -InstallAPIAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { - const InstallAPIOptions &Opts = CI.getInstallAPIOpts(); - InstallAPIContext Ctx; - Ctx.BA.InstallName = Opts.InstallName; - Ctx.BA.AppExtensionSafe = CI.getLangOpts().AppExt; - Ctx.BA.CurrentVersion = Opts.CurrentVersion; - // InstallAPI requires two level namespacing. 
- Ctx.BA.TwoLevelNamespace = true; - Ctx.TargetTriple = CI.getTarget().getTriple(); - - Ctx.Diags = &CI.getDiagnostics(); - Ctx.OutputLoc = CI.getFrontendOpts().OutputFile; - Ctx.OS = CreateOutputFile(CI, InFile); - if (!Ctx.OS) - return nullptr; - return std::make_unique(std::move(Ctx)); -} - -std::unique_ptr -InstallAPIAction::CreateOutputFile(CompilerInstance &CI, StringRef InFile) { - std::unique_ptr OS = - CI.createDefaultOutputFile(/*Binary=*/false, InFile, /*Extension=*/"tbd", - /*RemoveFileOnSignal=*/false); - if (!OS) - return nullptr; - return OS; -} diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index a47c474e520a0..925879a68cbd0 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -71,8 +71,6 @@ CreateFrontendBaseAction(CompilerInstance &CI) { case GenerateInterfaceStubs: return std::make_unique(); case InitOnly: return std::make_unique(); - case InstallAPI: - return std::make_unique(); case ParseSyntaxOnly: return std::make_unique(); case ModuleFileInfo: return std::make_unique(); case VerifyPCH: return std::make_unique(); diff --git a/clang/lib/InstallAPI/CMakeLists.txt b/clang/lib/InstallAPI/CMakeLists.txt index 6c9cb4b559f67..fdc4f064f29e9 100644 --- a/clang/lib/InstallAPI/CMakeLists.txt +++ b/clang/lib/InstallAPI/CMakeLists.txt @@ -4,7 +4,6 @@ set(LLVM_LINK_COMPONENTS ) add_clang_library(clangInstallAPI - Context.cpp FileList.cpp HeaderFile.cpp diff --git a/clang/lib/InstallAPI/Context.cpp b/clang/lib/InstallAPI/Context.cpp deleted file mode 100644 index d4df52f66560c..0000000000000 --- a/clang/lib/InstallAPI/Context.cpp +++ /dev/null @@ -1,27 +0,0 @@ -//===--- InstallAPI/Context.cpp -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang/InstallAPI/Context.h" -#include "clang/AST/ASTContext.h" -#include "llvm/TextAPI/TextAPIWriter.h" - -using namespace clang; -using namespace clang::installapi; -using namespace llvm::MachO; - -void InstallAPIConsumer::HandleTranslationUnit(ASTContext &Context) { - if (Context.getDiagnostics().hasErrorOccurred()) - return; - InterfaceFile IF; - // Set library attributes captured through cc1 args. - Target T(Ctx.TargetTriple); - IF.addTarget(T); - IF.setFromBinaryAttrs(Ctx.BA, T); - if (auto Err = TextAPIWriter::writeToStream(*Ctx.OS, IF, Ctx.FT)) - Ctx.Diags->Report(diag::err_cannot_open_file) << Ctx.OutputLoc; -} diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index 841317cef880a..6b5cb0a18457b 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -71,6 +71,7 @@ list(APPEND CLANG_TEST_DEPS clang-rename clang-refactor clang-diff + clang-installapi clang-scan-deps clang-linker-wrapper clang-offload-bundler diff --git a/clang/test/Driver/installapi.h b/clang/test/Driver/installapi.h deleted file mode 100644 index 99379b44d1379..0000000000000 --- a/clang/test/Driver/installapi.h +++ /dev/null @@ -1,13 +0,0 @@ -// Check non-darwin triple is rejected. -// RUN: not %clang -target x86_64-unknown-unknown -installapi %s 2> %t -// RUN: FileCheck --check-prefix INVALID_INSTALLAPI -input-file %t %s - -// INVALID_INSTALLAPI: error: InstallAPI is not supported for 'x86_64-unknown-unknown' - -// Check installapi phases. 
-// RUN: %clang -target x86_64-apple-macos11 -ccc-print-phases -installapi %s 2> %t -// RUN: FileCheck --check-prefix INSTALLAPI_PHASES -input-file %t %s - -// INSTALLAPI_PHASES: 0: input, -// INSTALLAPI_PHASES: installapi, -// INSTALLAPI_PHASES-SAME: tbd diff --git a/clang/test/InstallAPI/installapi-basic.test b/clang/test/InstallAPI/installapi-basic.test index 8035166d076da..22b04792ca2c3 100644 --- a/clang/test/InstallAPI/installapi-basic.test +++ b/clang/test/InstallAPI/installapi-basic.test @@ -1,10 +1,17 @@ // RUN: rm -rf %t // RUN: split-file %s %t -// RUN: %clang_cc1 -x objective-c -triple arm64-apple-ios13.0.0 -installapi \ +/// Check basic arguments are captured. +// RUN: clang-installapi -x objective-c -target arm64-apple-ios13.0.0 \ // RUN: -fapplication-extension -current_version 1 -install_name /usr/lib/basic.dylib \ // RUN: %t/basic_inputs.json -o %t/basic.tbd 2>&1 | FileCheck %s --allow-empty // RUN: llvm-readtapi -compare %t/basic.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty +/// Check multiple targets are captured. 
+// RUN: clang-installapi -x objective-c -target arm64-apple-ios14.1 -target arm64e-apple-ios14.1 \ +// RUN: -fapplication-extension -install_name /usr/lib/basic.dylib \ +// RUN: %t/basic_inputs.json -o %t/multi-targets.tbd 2>&1 | FileCheck %s --allow-empty +// RUN: llvm-readtapi -compare %t/multi-targets.tbd %t/expected-multi.tbd 2>&1 | FileCheck %s --allow-empty + // CHECK-NOT: error: // CHECK-NOT: warning: @@ -32,3 +39,33 @@ }, "tapi_tbd_version": 5 } + +//--- expected-multi.tbd +{ + "main_library": { + "compatibility_versions": [ + { + "version": "0" + }], + "current_versions": [ + { + "version": "0" + }], + "install_names": [ + { + "name": "/usr/lib/basic.dylib" + } + ], + "target_info": [ + { + "min_deployment": "14.1", + "target": "arm64-ios" + }, + { + "min_deployment": "14.1", + "target": "arm64e-ios" + } + ] + }, + "tapi_tbd_version": 5 +} diff --git a/clang/test/InstallAPI/installapi-driver-invalid-options.test b/clang/test/InstallAPI/installapi-driver-invalid-options.test new file mode 100644 index 0000000000000..a2e008e1eb03e --- /dev/null +++ b/clang/test/InstallAPI/installapi-driver-invalid-options.test @@ -0,0 +1,4 @@ +/// Check non-darwin triple is rejected. 
+// RUN: not clang-installapi -target x86_64-unknown-unknown %s 2> %t +// RUN: FileCheck --check-prefix INVALID_INSTALLAPI -input-file %t %s +// INVALID_INSTALLAPI: error: unsupported option 'installapi' for target 'x86_64-unknown-unknown' diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index f93b5d9c94588..e5630a07424c7 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -90,6 +90,7 @@ "clang-offload-packager", "clang-tblgen", "clang-scan-deps", + "clang-installapi", "opt", "llvm-ifs", "yaml2obj", diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt index f60db6ef0ba34..bdd8004be3e02 100644 --- a/clang/tools/CMakeLists.txt +++ b/clang/tools/CMakeLists.txt @@ -12,6 +12,7 @@ add_clang_subdirectory(clang-linker-wrapper) add_clang_subdirectory(clang-offload-packager) add_clang_subdirectory(clang-offload-bundler) add_clang_subdirectory(clang-scan-deps) +add_clang_subdirectory(clang-installapi) if(HAVE_CLANG_REPL_SUPPORT) add_clang_subdirectory(clang-repl) endif() diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt new file mode 100644 index 0000000000000..c8dd56db101da --- /dev/null +++ b/clang/tools/clang-installapi/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLVM_LINK_COMPONENTS + Support + TargetParser + TextAPI + ) + +add_clang_tool(clang-installapi + ClangInstallAPI.cpp + Options.cpp + + GENERATE_DRIVER + ) + +clang_target_link_libraries(clang-installapi + PRIVATE + clangInstallAPI + clangDriver + clangFrontend + clangTooling + ) diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp new file mode 100644 index 0000000000000..fc23ffd7ae6b9 --- /dev/null +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp @@ -0,0 +1,121 @@ +//===-- ClangInstallAPI.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the entry point to clang-installapi; it is a wrapper +// for functionality in the InstallAPI clang library. +// +//===----------------------------------------------------------------------===// + +#include "Options.h" +#include "clang/Basic/DiagnosticIDs.h" +#include "clang/Driver/Driver.h" +#include "clang/Driver/DriverDiagnostic.h" +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Frontend/TextDiagnosticPrinter.h" +#include "clang/InstallAPI/Context.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/LLVMDriver.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/Signals.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TextAPI/TextAPIWriter.h" + +using namespace clang; +using namespace clang::installapi; +using namespace clang::driver::options; +using namespace llvm::opt; +using namespace llvm::MachO; + +static bool run(ArrayRef Args, const char *ProgName) { + // Setup Diagnostics engine. + IntrusiveRefCntPtr DiagOpts = new DiagnosticOptions(); + const llvm::opt::OptTable &ClangOpts = clang::driver::getDriverOptTable(); + unsigned MissingArgIndex, MissingArgCount; + llvm::opt::InputArgList ParsedArgs = ClangOpts.ParseArgs( + ArrayRef(Args).slice(1), MissingArgIndex, MissingArgCount); + ParseDiagnosticArgs(*DiagOpts, ParsedArgs); + + IntrusiveRefCntPtr Diag = new clang::DiagnosticsEngine( + new clang::DiagnosticIDs(), DiagOpts.get(), + new clang::TextDiagnosticPrinter(llvm::errs(), DiagOpts.get())); + + // Create file manager for all file operations. 
+ IntrusiveRefCntPtr FM( + new FileManager(clang::FileSystemOptions())); + + // Set up driver to parse input arguments. + auto DriverArgs = llvm::ArrayRef(Args).slice(1); + clang::driver::Driver Driver(ProgName, llvm::sys::getDefaultTargetTriple(), + *Diag, "clang installapi tool"); + Driver.setInstalledDir(llvm::sys::path::parent_path(ProgName)); + auto TargetAndMode = + clang::driver::ToolChain::getTargetAndModeFromProgramName(ProgName); + Driver.setTargetAndMode(TargetAndMode); + bool HasError = false; + llvm::opt::InputArgList ArgList = + Driver.ParseArgStrings(DriverArgs, /*UseDriverMode=*/true, HasError); + if (HasError) + return EXIT_FAILURE; + Driver.setCheckInputsExist(false); + + // Capture InstallAPI specific options and diagnose any option errors. + Options Opts(*Diag, FM.get(), ArgList); + if (Diag->hasErrorOccurred()) + return EXIT_FAILURE; + InstallAPIContext Ctx = Opts.createContext(); + + // Set up compilation. + std::unique_ptr CI(new CompilerInstance()); + CI->setFileManager(FM.get()); + CI->createDiagnostics(); + if (!CI->hasDiagnostics()) + return EXIT_FAILURE; + + auto Out = CI->createOutputFile(Ctx.OutputLoc, /*Binary=*/false, + /*RemoveFileOnSignal=*/false, + /*UseTemporary=*/false, + /*CreateMissingDirectories=*/false); + if (!Out) + return EXIT_FAILURE; + + // Assign attributes for serialization. + InterfaceFile IF; + for (const auto &TargetInfo : Opts.DriverOpts.Targets) { + IF.addTarget(TargetInfo.first); + IF.setFromBinaryAttrs(Ctx.BA, TargetInfo.first); + } + + // Write output file and perform CI cleanup. + if (auto Err = TextAPIWriter::writeToStream(*Out, IF, Ctx.FT)) { + Diag->Report(diag::err_cannot_open_file) << Ctx.OutputLoc; + CI->clearOutputFiles(/*EraseFiles=*/true); + return EXIT_FAILURE; + } + + CI->clearOutputFiles(/*EraseFiles=*/false); + return EXIT_SUCCESS; +} + +int clang_installapi_main(int argc, char **argv, + const llvm::ToolContext &ToolContext) { + // Standard set up, so program fails gracefully. 
+ llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); + llvm::PrettyStackTraceProgram StackPrinter(argc, argv); + llvm::llvm_shutdown_obj Shutdown; + + if (llvm::sys::Process::FixupStandardFileDescriptors()) + return EXIT_FAILURE; + + const char *ProgName = + ToolContext.NeedsPrependArg ? ToolContext.PrependArg : ToolContext.Path; + return run(llvm::ArrayRef(argv, argc), ProgName); +} diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp new file mode 100644 index 0000000000000..08d1c0e8e660f --- /dev/null +++ b/clang/tools/clang-installapi/Options.cpp @@ -0,0 +1,129 @@ +//===-- Options.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Options.h" +#include "clang/Driver/Driver.h" +#include "clang/Frontend/FrontendDiagnostic.h" +#include "llvm/Support/Program.h" +#include "llvm/TargetParser/Host.h" + +using namespace clang::driver; +using namespace clang::driver::options; +using namespace llvm::opt; +using namespace llvm::MachO; + +namespace clang { +namespace installapi { + +bool Options::processDriverOptions(InputArgList &Args) { + // Handle inputs. + llvm::vfs::Status Stat; + for (const auto &Path : Args.getAllArgValues(OPT_INPUT)) { + if (FM->getNoncachedStatValue(Path, Stat) || !Stat.exists()) { + Diags->Report(clang::diag::err_drv_no_such_file) << Path; + return false; + } + DriverOpts.FileLists.push_back(std::move(Path)); + } + + // Handle output. 
+ SmallString OutputPath; + if (auto *Arg = Args.getLastArg(OPT_o)) { + OutputPath = Arg->getValue(); + if (OutputPath != "-") + FM->makeAbsolutePath(OutputPath); + DriverOpts.OutputPath = std::string(OutputPath); + } + + // Do basic error checking first for mixing -target and -arch options. + auto *ArgArch = Args.getLastArgNoClaim(OPT_arch); + auto *ArgTarget = Args.getLastArgNoClaim(OPT_target); + auto *ArgTargetVariant = + Args.getLastArgNoClaim(OPT_darwin_target_variant_triple); + if (ArgArch && (ArgTarget || ArgTargetVariant)) { + Diags->Report(clang::diag::err_drv_argument_not_allowed_with) + << ArgArch->getAsString(Args) + << (ArgTarget ? ArgTarget : ArgTargetVariant)->getAsString(Args); + return false; + } + + auto *ArgMinTargetOS = Args.getLastArgNoClaim(OPT_mtargetos_EQ); + if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) { + Diags->Report(clang::diag::err_drv_cannot_mix_options) + << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args); + return false; + } + + // Capture target triples first. + if (ArgTarget) { + for (auto *Arg : Args.filtered(OPT_target)) { + llvm::Triple TargetTriple(Arg->getValue()); + Target TAPITarget = Target(TargetTriple); + if ((TAPITarget.Arch == AK_unknown) || + (TAPITarget.Platform == PLATFORM_UNKNOWN)) { + Diags->Report(clang::diag::err_drv_unsupported_opt_for_target) + << "installapi" << TargetTriple.str(); + return false; + } + DriverOpts.Targets[TAPITarget] = TargetTriple; + } + } + + return true; +} + +bool Options::processLinkerOptions(InputArgList &Args) { + // TODO: add error handling. + + // Required arguments. + if (const Arg *A = Args.getLastArg(options::OPT_install__name)) + LinkerOpts.InstallName = A->getValue(); + + // Defaulted or optional arguments. 
+ if (auto *Arg = Args.getLastArg(OPT_current__version)) + LinkerOpts.CurrentVersion.parse64(Arg->getValue()); + + LinkerOpts.IsDylib = Args.hasArg(OPT_dynamiclib); + + LinkerOpts.AppExtensionSafe = + Args.hasFlag(OPT_fapplication_extension, OPT_fno_application_extension, + /*Default=*/LinkerOpts.AppExtensionSafe); + + if (::getenv("LD_NO_ENCRYPT") != nullptr) + LinkerOpts.AppExtensionSafe = true; + + if (::getenv("LD_APPLICATION_EXTENSION_SAFE") != nullptr) + LinkerOpts.AppExtensionSafe = true; + return true; +} + +Options::Options(DiagnosticsEngine &Diag, FileManager *FM, + InputArgList &ArgList) + : Diags(&Diag), FM(FM) { + if (!processDriverOptions(ArgList)) + return; + + if (!processLinkerOptions(ArgList)) + return; +} + +InstallAPIContext Options::createContext() { + InstallAPIContext Ctx; + // InstallAPI requires two level namespacing. + Ctx.BA.TwoLevelNamespace = true; + + Ctx.BA.InstallName = LinkerOpts.InstallName; + Ctx.BA.CurrentVersion = LinkerOpts.CurrentVersion; + Ctx.BA.AppExtensionSafe = LinkerOpts.AppExtensionSafe; + Ctx.FT = DriverOpts.OutFT; + Ctx.OutputLoc = DriverOpts.OutputPath; + return Ctx; +} + +} // namespace installapi +} // namespace clang diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h new file mode 100644 index 0000000000000..4a84166a6c91b --- /dev/null +++ b/clang/tools/clang-installapi/Options.h @@ -0,0 +1,88 @@ +//===--- clang-installapi/Options.h - Options -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_CLANG_INSTALLAPI_OPTIONS_H +#define LLVM_CLANG_TOOLS_CLANG_INSTALLAPI_OPTIONS_H + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/FileManager.h" +#include "clang/Frontend/FrontendOptions.h" +#include "clang/InstallAPI/Context.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/Program.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/TextAPI/Architecture.h" +#include "llvm/TextAPI/InterfaceFile.h" +#include "llvm/TextAPI/PackedVersion.h" +#include "llvm/TextAPI/Platform.h" +#include "llvm/TextAPI/Target.h" +#include "llvm/TextAPI/Utils.h" +#include +#include +#include + +namespace clang { +namespace installapi { +using Macro = std::pair; + +struct DriverOptions { + /// \brief Path to input file lists (JSON). + llvm::MachO::PathSeq FileLists; + + /// \brief Mappings of target triples & tapi targets to build for. + std::map Targets; + + /// \brief Output path. + std::string OutputPath; + + /// \brief File encoding to print. + llvm::MachO::FileType OutFT = llvm::MachO::FileType::TBD_V5; +}; + +struct LinkerOptions { + /// \brief The install name to use for the dynamic library. + std::string InstallName; + + /// \brief The current version to use for the dynamic library. + llvm::MachO::PackedVersion CurrentVersion; + + /// \brief Is application extension safe. + bool AppExtensionSafe = false; + + /// \brief Set if we should scan for a dynamic library and not a framework. + bool IsDylib = false; +}; + +class Options { +private: + bool processDriverOptions(llvm::opt::InputArgList &Args); + bool processLinkerOptions(llvm::opt::InputArgList &Args); + +public: + /// The various options grouped together. 
+ DriverOptions DriverOpts; + LinkerOptions LinkerOpts; + + Options() = delete; + + /// \brief Create InstallAPIContext from processed options. + InstallAPIContext createContext(); + + /// \brief Constructor for options. + Options(clang::DiagnosticsEngine &Diag, FileManager *FM, + llvm::opt::InputArgList &Args); + +private: + DiagnosticsEngine *Diags; + FileManager *FM; +}; + +} // namespace installapi +} // namespace clang +#endif From 2c30180f72b9c39dda1fd77bcbec8022e19bec23 Mon Sep 17 00:00:00 2001 From: Mogball Date: Wed, 21 Feb 2024 17:48:38 +0000 Subject: [PATCH 128/351] [coro] [async] Don't fail on targets that don't support tail calls --- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index e6b7c9ae90945..90d40242ff2e4 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1854,9 +1854,7 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape, FnArgs, Builder); Builder.CreateRetVoid(); InlineFunctionInfo FnInfo; - auto InlineRes = InlineFunction(*TailCall, FnInfo); - assert(InlineRes.isSuccess() && "Expected inlining to succeed"); - (void)InlineRes; + (void)InlineFunction(*TailCall, FnInfo); // Replace the lvm.coro.async.resume intrisic call. replaceAsyncResumeFunction(Suspend, Continuation); From e50a231dcdd6aafa922b177b4fc4629bb7a10a79 Mon Sep 17 00:00:00 2001 From: Leandro Lupori Date: Wed, 21 Feb 2024 14:51:37 -0300 Subject: [PATCH 129/351] [flang][OpenMP] Add support for copyprivate (#80485) Add initial handling of OpenMP copyprivate clause in Flang. When lowering copyprivate, Flang generates the copy function needed by each variable and builds the appropriate omp.single's CopyPrivateVarList. This is patch 3 of 4, to add support for COPYPRIVATE in Flang. 
Original PR: https://github.com/llvm/llvm-project/pull/73128 --- flang/include/flang/Lower/AbstractConverter.h | 3 + flang/lib/Lower/Bridge.cpp | 137 ++++++++------- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 156 +++++++++++++++++ flang/lib/Lower/OpenMP/ClauseProcessor.h | 4 + flang/lib/Lower/OpenMP/OpenMP.cpp | 17 +- flang/test/Lower/OpenMP/Todo/copyprivate.f90 | 13 -- flang/test/Lower/OpenMP/copyprivate.f90 | 164 ++++++++++++++++++ 7 files changed, 418 insertions(+), 76 deletions(-) delete mode 100644 flang/test/Lower/OpenMP/Todo/copyprivate.f90 create mode 100644 flang/test/Lower/OpenMP/copyprivate.f90 diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index 796933a4eb5f6..e2af59e0aaa19 100644 --- a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -121,6 +121,9 @@ class AbstractConverter { const Fortran::semantics::Symbol &sym, mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr) = 0; + virtual void copyVar(mlir::Location loc, mlir::Value dst, + mlir::Value src) = 0; + /// For a given symbol, check if it is present in the inner-most /// level of the symbol map. 
virtual bool isPresentShallowLookup(Fortran::semantics::Symbol &sym) = 0; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 2d7f748cefa2d..83555e7cd82e7 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -744,6 +744,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { }); } + void copyVar(mlir::Location loc, mlir::Value dst, + mlir::Value src) override final { + copyVarHLFIR(loc, dst, src); + } + void copyHostAssociateVar( const Fortran::semantics::Symbol &sym, mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr) override final { @@ -778,64 +783,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { rhs_sb = &hsb; } - mlir::Location loc = genLocation(sym.name()); - - if (lowerToHighLevelFIR()) { - hlfir::Entity lhs{lhs_sb->getAddr()}; - hlfir::Entity rhs{rhs_sb->getAddr()}; - // Temporary_lhs is set to true in hlfir.assign below to avoid user - // assignment to be used and finalization to be called on the LHS. - // This may or may not be correct but mimics the current behaviour - // without HLFIR. - auto copyData = [&](hlfir::Entity l, hlfir::Entity r) { - // Dereference RHS and load it if trivial scalar. - r = hlfir::loadTrivialScalar(loc, *builder, r); - builder->create( - loc, r, l, - /*isWholeAllocatableAssignment=*/false, - /*keepLhsLengthInAllocatableAssignment=*/false, - /*temporary_lhs=*/true); - }; - if (lhs.isAllocatable()) { - // Deep copy allocatable if it is allocated. - // Note that when allocated, the RHS is already allocated with the LHS - // shape for copy on entry in createHostAssociateVarClone. - // For lastprivate, this assumes that the RHS was not reallocated in - // the OpenMP region. 
- lhs = hlfir::derefPointersAndAllocatables(loc, *builder, lhs); - mlir::Value addr = hlfir::genVariableRawAddress(loc, *builder, lhs); - mlir::Value isAllocated = builder->genIsNotNullAddr(loc, addr); - builder->genIfThen(loc, isAllocated) - .genThen([&]() { - // Copy the DATA, not the descriptors. - copyData(lhs, rhs); - }) - .end(); - } else if (lhs.isPointer()) { - // Set LHS target to the target of RHS (do not copy the RHS - // target data into the LHS target storage). - auto loadVal = builder->create(loc, rhs); - builder->create(loc, loadVal, lhs); - } else { - // Non ALLOCATABLE/POINTER variable. Simple DATA copy. - copyData(lhs, rhs); - } - } else { - fir::ExtendedValue lhs = symBoxToExtendedValue(*lhs_sb); - fir::ExtendedValue rhs = symBoxToExtendedValue(*rhs_sb); - mlir::Type symType = genType(sym); - if (auto seqTy = symType.dyn_cast()) { - Fortran::lower::StatementContext stmtCtx; - Fortran::lower::createSomeArrayAssignment(*this, lhs, rhs, localSymbols, - stmtCtx); - stmtCtx.finalizeAndReset(); - } else if (lhs.getBoxOf()) { - fir::factory::CharacterExprHelper{*builder, loc}.createAssign(lhs, rhs); - } else { - auto loadVal = builder->create(loc, fir::getBase(rhs)); - builder->create(loc, loadVal, fir::getBase(lhs)); - } - } + copyVar(sym, *lhs_sb, *rhs_sb); if (copyAssignIP && copyAssignIP->isSet() && sym.test(Fortran::semantics::Symbol::Flag::OmpLastPrivate)) { @@ -1093,6 +1041,79 @@ class FirConverter : public Fortran::lower::AbstractConverter { return true; } + void copyVar(const Fortran::semantics::Symbol &sym, + const Fortran::lower::SymbolBox &lhs_sb, + const Fortran::lower::SymbolBox &rhs_sb) { + mlir::Location loc = genLocation(sym.name()); + if (lowerToHighLevelFIR()) + copyVarHLFIR(loc, lhs_sb.getAddr(), rhs_sb.getAddr()); + else + copyVarFIR(loc, sym, lhs_sb, rhs_sb); + } + + void copyVarHLFIR(mlir::Location loc, mlir::Value dst, mlir::Value src) { + assert(lowerToHighLevelFIR()); + hlfir::Entity lhs{dst}; + hlfir::Entity rhs{src}; + // 
Temporary_lhs is set to true in hlfir.assign below to avoid user + // assignment to be used and finalization to be called on the LHS. + // This may or may not be correct but mimics the current behaviour + // without HLFIR. + auto copyData = [&](hlfir::Entity l, hlfir::Entity r) { + // Dereference RHS and load it if trivial scalar. + r = hlfir::loadTrivialScalar(loc, *builder, r); + builder->create( + loc, r, l, + /*isWholeAllocatableAssignment=*/false, + /*keepLhsLengthInAllocatableAssignment=*/false, + /*temporary_lhs=*/true); + }; + if (lhs.isAllocatable()) { + // Deep copy allocatable if it is allocated. + // Note that when allocated, the RHS is already allocated with the LHS + // shape for copy on entry in createHostAssociateVarClone. + // For lastprivate, this assumes that the RHS was not reallocated in + // the OpenMP region. + lhs = hlfir::derefPointersAndAllocatables(loc, *builder, lhs); + mlir::Value addr = hlfir::genVariableRawAddress(loc, *builder, lhs); + mlir::Value isAllocated = builder->genIsNotNullAddr(loc, addr); + builder->genIfThen(loc, isAllocated) + .genThen([&]() { + // Copy the DATA, not the descriptors. + copyData(lhs, rhs); + }) + .end(); + } else if (lhs.isPointer()) { + // Set LHS target to the target of RHS (do not copy the RHS + // target data into the LHS target storage). + auto loadVal = builder->create(loc, rhs); + builder->create(loc, loadVal, lhs); + } else { + // Non ALLOCATABLE/POINTER variable. Simple DATA copy. 
+ copyData(lhs, rhs); + } + } + + void copyVarFIR(mlir::Location loc, const Fortran::semantics::Symbol &sym, + const Fortran::lower::SymbolBox &lhs_sb, + const Fortran::lower::SymbolBox &rhs_sb) { + assert(!lowerToHighLevelFIR()); + fir::ExtendedValue lhs = symBoxToExtendedValue(lhs_sb); + fir::ExtendedValue rhs = symBoxToExtendedValue(rhs_sb); + mlir::Type symType = genType(sym); + if (auto seqTy = symType.dyn_cast()) { + Fortran::lower::StatementContext stmtCtx; + Fortran::lower::createSomeArrayAssignment(*this, lhs, rhs, localSymbols, + stmtCtx); + stmtCtx.finalizeAndReset(); + } else if (lhs.getBoxOf()) { + fir::factory::CharacterExprHelper{*builder, loc}.createAssign(lhs, rhs); + } else { + auto loadVal = builder->create(loc, fir::getBase(rhs)); + builder->create(loc, loadVal, fir::getBase(lhs)); + } + } + /// Map a block argument to a result or dummy symbol. This is not the /// definitive mapping. The specification expression have not been lowered /// yet. The final mapping will be done using this pre-mapping in diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 4e3951492fb65..a41f8312a28c9 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -609,6 +609,162 @@ bool ClauseProcessor::processCopyin() const { return hasCopyin; } +/// Class that extracts information from the specified type. +class TypeInfo { +public: + TypeInfo(mlir::Type ty) { typeScan(ty); } + + // Returns the length of character types. + std::optional getCharLength() const { + return charLen; + } + + // Returns the shape of array types. + const llvm::SmallVector &getShape() const { return shape; } + + // Is the type inside a box? 
+ bool isBox() const { return inBox; } + +private: + void typeScan(mlir::Type type); + + std::optional charLen; + llvm::SmallVector shape; + bool inBox = false; +}; + +void TypeInfo::typeScan(mlir::Type ty) { + if (auto sty = mlir::dyn_cast(ty)) { + assert(shape.empty() && !sty.getShape().empty()); + shape = llvm::SmallVector(sty.getShape()); + typeScan(sty.getEleTy()); + } else if (auto bty = mlir::dyn_cast(ty)) { + inBox = true; + typeScan(bty.getEleTy()); + } else if (auto cty = mlir::dyn_cast(ty)) { + charLen = cty.getLen(); + } else if (auto hty = mlir::dyn_cast(ty)) { + typeScan(hty.getEleTy()); + } else if (auto pty = mlir::dyn_cast(ty)) { + typeScan(pty.getEleTy()); + } else { + // The scan ends when reaching any built-in or record type. + assert(ty.isIntOrIndexOrFloat() || mlir::isa(ty) || + mlir::isa(ty) || mlir::isa(ty)); + } +} + +// Create a function that performs a copy between two variables, compatible +// with their types and attributes. +static mlir::func::FuncOp +createCopyFunc(mlir::Location loc, Fortran::lower::AbstractConverter &converter, + mlir::Type varType, fir::FortranVariableFlagsEnum varAttrs) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mlir::ModuleOp module = builder.getModule(); + mlir::Type eleTy = mlir::cast(varType).getEleTy(); + TypeInfo typeInfo(eleTy); + std::string copyFuncName = + fir::getTypeAsString(eleTy, builder.getKindMap(), "_copy"); + + if (auto decl = module.lookupSymbol(copyFuncName)) + return decl; + + // create function + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::OpBuilder modBuilder(module.getBodyRegion()); + llvm::SmallVector argsTy = {varType, varType}; + auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {}); + mlir::func::FuncOp funcOp = + modBuilder.create(loc, copyFuncName, funcType); + funcOp.setVisibility(mlir::SymbolTable::Visibility::Private); + builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy, + {loc, loc}); + 
builder.setInsertionPointToStart(&funcOp.getRegion().back()); + // generate body + fir::FortranVariableFlagsAttr attrs; + if (varAttrs != fir::FortranVariableFlagsEnum::None) + attrs = fir::FortranVariableFlagsAttr::get(builder.getContext(), varAttrs); + llvm::SmallVector typeparams; + if (typeInfo.getCharLength().has_value()) { + mlir::Value charLen = builder.createIntegerConstant( + loc, builder.getCharacterLengthType(), *typeInfo.getCharLength()); + typeparams.push_back(charLen); + } + mlir::Value shape; + if (!typeInfo.isBox() && !typeInfo.getShape().empty()) { + llvm::SmallVector extents; + for (auto extent : typeInfo.getShape()) + extents.push_back( + builder.createIntegerConstant(loc, builder.getIndexType(), extent)); + shape = builder.create(loc, extents); + } + auto declDst = builder.create(loc, funcOp.getArgument(0), + copyFuncName + "_dst", shape, + typeparams, attrs); + auto declSrc = builder.create(loc, funcOp.getArgument(1), + copyFuncName + "_src", shape, + typeparams, attrs); + converter.copyVar(loc, declDst.getBase(), declSrc.getBase()); + builder.create(loc); + return funcOp; +} + +bool ClauseProcessor::processCopyPrivate( + mlir::Location currentLocation, + llvm::SmallVectorImpl ©PrivateVars, + llvm::SmallVectorImpl ©PrivateFuncs) const { + auto addCopyPrivateVar = [&](Fortran::semantics::Symbol *sym) { + mlir::Value symVal = converter.getSymbolAddress(*sym); + auto declOp = symVal.getDefiningOp(); + if (!declOp) + fir::emitFatalError(currentLocation, + "COPYPRIVATE is supported only in HLFIR mode"); + symVal = declOp.getBase(); + mlir::Type symType = symVal.getType(); + fir::FortranVariableFlagsEnum attrs = + declOp.getFortranAttrs().has_value() + ? *declOp.getFortranAttrs() + : fir::FortranVariableFlagsEnum::None; + mlir::Value cpVar = symVal; + + // CopyPrivate variables must be passed by reference. However, in the case + // of assumed shapes/vla the type is not a !fir.ref, but a !fir.box. 
+ // In these cases to retrieve the appropriate !fir.ref> to + // access the data we need we must perform an alloca and then store to it + // and retrieve the data from the new alloca. + if (mlir::isa(symType)) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + auto alloca = builder.create(currentLocation, symType); + builder.create(currentLocation, symVal, alloca); + cpVar = alloca; + } + + copyPrivateVars.push_back(cpVar); + mlir::func::FuncOp funcOp = + createCopyFunc(currentLocation, converter, cpVar.getType(), attrs); + copyPrivateFuncs.push_back(mlir::SymbolRefAttr::get(funcOp)); + }; + + bool hasCopyPrivate = findRepeatableClause( + [&](const ClauseTy::Copyprivate *copyPrivateClause, + const Fortran::parser::CharBlock &) { + const Fortran::parser::OmpObjectList &ompObjectList = + copyPrivateClause->v; + for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) { + Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject); + if (const auto *commonDetails = + sym->detailsIf()) { + for (const auto &mem : commonDetails->objects()) + addCopyPrivateVar(&*mem); + break; + } + addCopyPrivateVar(sym); + } + }); + + return hasCopyPrivate; +} + bool ClauseProcessor::processDepend( llvm::SmallVectorImpl &dependTypeOperands, llvm::SmallVectorImpl &dependOperands) const { diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 312255112605e..11aff0be25053 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -95,6 +95,10 @@ class ClauseProcessor { processAllocate(llvm::SmallVectorImpl &allocatorOperands, llvm::SmallVectorImpl &allocateOperands) const; bool processCopyin() const; + bool processCopyPrivate( + mlir::Location currentLocation, + llvm::SmallVectorImpl ©PrivateVars, + llvm::SmallVectorImpl ©PrivateFuncs) const; bool processDepend(llvm::SmallVectorImpl &dependTypeOperands, llvm::SmallVectorImpl &dependOperands) const; bool diff --git 
a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 89bd5ed080b20..7953bf83cba0f 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -25,6 +25,7 @@ #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/openmp-directive-sets.h" @@ -639,21 +640,26 @@ genSingleOp(Fortran::lower::AbstractConverter &converter, const Fortran::parser::OmpClauseList &endClauseList) { llvm::SmallVector allocateOperands, allocatorOperands; llvm::SmallVector copyPrivateVars; + llvm::SmallVector copyPrivateFuncs; mlir::UnitAttr nowaitAttr; ClauseProcessor cp(converter, semaCtx, beginClauseList); cp.processAllocate(allocatorOperands, allocateOperands); - cp.processTODO( - currentLocation, llvm::omp::Directive::OMPD_single); - ClauseProcessor(converter, semaCtx, endClauseList).processNowait(nowaitAttr); + ClauseProcessor ecp(converter, semaCtx, endClauseList); + ecp.processNowait(nowaitAttr); + ecp.processCopyPrivate(currentLocation, copyPrivateVars, copyPrivateFuncs); return genOpWithBody( OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) .setGenNested(genNested) .setClauses(&beginClauseList), allocateOperands, allocatorOperands, copyPrivateVars, - /*copyPrivateFuncs=*/nullptr, nowaitAttr); + copyPrivateFuncs.empty() + ? 
nullptr + : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(), + copyPrivateFuncs), + nowaitAttr); } static mlir::omp::TaskOp @@ -1689,7 +1695,8 @@ genOMP(Fortran::lower::AbstractConverter &converter, for (const auto &clause : endClauseList.v) { mlir::Location clauseLocation = converter.genLocation(clause.source); - if (!std::get_if(&clause.u)) + if (!std::get_if(&clause.u) && + !std::get_if(&clause.u)) TODO(clauseLocation, "OpenMP Block construct clause"); } diff --git a/flang/test/Lower/OpenMP/Todo/copyprivate.f90 b/flang/test/Lower/OpenMP/Todo/copyprivate.f90 deleted file mode 100644 index 0d871427ce60f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/copyprivate.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: OpenMP Block construct clause -subroutine sb - integer, save :: a - !$omp threadprivate(a) - !$omp parallel - !$omp single - a = 3 - !$omp end single copyprivate(a) - !$omp end parallel -end subroutine diff --git a/flang/test/Lower/OpenMP/copyprivate.f90 b/flang/test/Lower/OpenMP/copyprivate.f90 new file mode 100644 index 0000000000000..9b76a996ef3e1 --- /dev/null +++ b/flang/test/Lower/OpenMP/copyprivate.f90 @@ -0,0 +1,164 @@ +! Test COPYPRIVATE. +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +!CHECK-DAG: func private @_copy_i64(%{{.*}}: !fir.ref, %{{.*}}: !fir.ref) +!CHECK-DAG: func private @_copy_f32(%{{.*}}: !fir.ref, %{{.*}}: !fir.ref) +!CHECK-DAG: func private @_copy_f64(%{{.*}}: !fir.ref, %{{.*}}: !fir.ref) +!CHECK-DAG: func private @_copy_z32(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_z64(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_l32(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_l64(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_c8x3(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_c8x8(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_c16x8(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) + +!CHECK-DAG: func private @_copy_box_Uxi32(%{{.*}}: !fir.ref>>, %{{.*}}: !fir.ref>>) +!CHECK-DAG: func private @_copy_10xi32(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_3x4xi32(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_10xf32(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_3x4xz32(%{{.*}}: !fir.ref>>, %{{.*}}: !fir.ref>>) +!CHECK-DAG: func private @_copy_10xl32(%{{.*}}: !fir.ref>>, %{{.*}}: !fir.ref>>) +!CHECK-DAG: func private @_copy_3xc8x8(%{{.*}}: !fir.ref>>, %{{.*}}: !fir.ref>>) +!CHECK-DAG: func private @_copy_3xc16x5(%{{.*}}: !fir.ref>>, %{{.*}}: !fir.ref>>) + +!CHECK-DAG: func private @_copy_rec__QFtest_dtTdt(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>) +!CHECK-DAG: func private @_copy_box_heap_Uxi32(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>) +!CHECK-DAG: func private @_copy_box_heap_i32(%{{.*}}: !fir.ref>>, %{{.*}}: !fir.ref>>) +!CHECK-DAG: func private @_copy_box_ptr_i32(%{{.*}}: !fir.ref>>, %{{.*}}: !fir.ref>>) +!CHECK-DAG: func private @_copy_box_ptr_Uxf32(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>) +!CHECK-DAG: func private 
@_copy_box_heap_Uxc8x5(%{{.*}}: !fir.ref>>>>, %{{.*}}: !fir.ref>>>>) +!CHECK-DAG: func private @_copy_box_ptr_Uxc8x9(%{{.*}}: !fir.ref>>>>, %{{.*}}: !fir.ref>>>>) + +!CHECK-LABEL: func private @_copy_i32( +!CHECK-SAME: %[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref) { +!CHECK-NEXT: %[[DST:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_copy_i32_dst"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK-NEXT: %[[SRC:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_copy_i32_src"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK-NEXT: %[[SRC_VAL:.*]] = fir.load %[[SRC]]#0 : !fir.ref +!CHECK-NEXT: hlfir.assign %[[SRC_VAL]] to %[[DST]]#0 temporary_lhs : i32, !fir.ref +!CHECK-NEXT: return +!CHECK-NEXT: } + +!CHECK-LABEL: func @_QPtest_tp +!CHECK: omp.parallel +!CHECK: %[[I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_tpEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[J:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_tpEj"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[K:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_tpEk"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: omp.single copyprivate(%[[I]]#0 -> @_copy_i32 : !fir.ref, %[[J]]#0 -> @_copy_i32 : !fir.ref, %[[K]]#0 -> @_copy_f32 : !fir.ref) +subroutine test_tp() + integer, save :: i, j + !$omp threadprivate(i, j) + real :: k + + k = 33.3 + !$omp parallel firstprivate(k) + !$omp single + i = 11 + j = 22 + !$omp end single copyprivate(i, j, k) + !$omp end parallel +end subroutine + +!CHECK-LABEL: func @_QPtest_scalar +!CHECK: omp.parallel +!CHECK: %[[I1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEi1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[I2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEi2"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[I3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEi3"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[R1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEr1"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: 
%[[R2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEr2"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[C1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEc1"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[C2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEc2"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[L1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEl1"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[L2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEl2"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[S1:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEs1"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[S2:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEs2"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[S3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_scalarEs3"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +!CHECK: omp.single copyprivate(%[[I1]]#0 -> @_copy_i32 : !fir.ref, %[[I2]]#0 -> @_copy_i64 : !fir.ref, %[[I3]]#0 -> @_copy_i64 : !fir.ref, %[[R1]]#0 -> @_copy_f32 : !fir.ref, %[[R2]]#0 -> @_copy_f64 : !fir.ref, %[[C1]]#0 -> @_copy_z32 : !fir.ref>, %[[C2]]#0 -> @_copy_z64 : !fir.ref>, %[[L1]]#0 -> @_copy_l32 : !fir.ref>, %[[L2]]#0 -> @_copy_l64 : !fir.ref>, %[[S1]]#0 -> @_copy_c8x3 : !fir.ref>, %[[S2]]#0 -> @_copy_c8x8 : !fir.ref>, %[[S3]]#0 -> @_copy_c16x8 : !fir.ref>) +subroutine test_scalar() + integer(4) :: i1 + integer(8) :: i2, i3 + real(4) :: r1 + real(8) :: r2 + complex(4) :: c1 + complex(8) :: c2 + logical(4) :: l1 + logical(8) :: l2 + character(kind=1, len=3) :: s1 + character(kind=1, len=8) :: s2 + character(kind=2, len=8) :: s3 + + !$omp parallel private(i1, i2, i3, r1, r2, c1, c2, l1, l2, s1, s2, s3) + !$omp single + !$omp end single copyprivate(i1, i2, i3, r1, r2, c1, c2, l1, l2, s1, s2, s3) + !$omp end parallel +end subroutine + +!CHECK-LABEL: func @_QPtest_array +!CHECK: omp.parallel +!CHECK: 
%[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +!CHECK: %[[I1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEi1"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[I2:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEi2"} : (!fir.ref>, !fir.shape<2>) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[I3:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEi3"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +!CHECK: %[[R1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEr1"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[C1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEc1"} : (!fir.ref>>, !fir.shape<2>) -> (!fir.ref>>, !fir.ref>>) +!CHECK: %[[L1:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFtest_arrayEl1"} : (!fir.ref>>, !fir.shape<1>) -> (!fir.ref>>, !fir.ref>>) +!CHECK: %[[S1:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFtest_arrayEs1"} : (!fir.ref>>, !fir.shape<1>, index) -> (!fir.ref>>, !fir.ref>>) +!CHECK: %[[S2:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFtest_arrayEs2"} : (!fir.ref>>, !fir.shape<1>, index) -> (!fir.ref>>, !fir.ref>>) +!CHECK: %[[A_REF:.*]] = fir.alloca !fir.box> +!CHECK: fir.store %[[A]]#0 to %[[A_REF]] : !fir.ref>> +!CHECK: %[[I3_REF:.*]] = fir.alloca !fir.box> +!CHECK: fir.store %[[I3]]#0 to %[[I3_REF]] : !fir.ref>> +!CHECK: omp.single copyprivate(%[[A_REF]] -> @_copy_box_Uxi32 : !fir.ref>>, %[[I1]]#0 -> @_copy_10xi32 : !fir.ref>, %[[I2]]#0 -> @_copy_3x4xi32 : !fir.ref>, %[[I3_REF]] -> @_copy_box_Uxi32 : !fir.ref>>, %[[R1]]#0 -> @_copy_10xf32 : !fir.ref>, %[[C1]]#0 -> @_copy_3x4xz32 : !fir.ref>>, %[[L1]]#0 -> @_copy_10xl32 : !fir.ref>>, %[[S1]]#0 -> @_copy_3xc8x8 : !fir.ref>>, %[[S2]]#0 -> @_copy_3xc16x5 : !fir.ref>>) +subroutine test_array(a, n) + integer :: a(:), n + integer :: i1(10), i2(3, 4), i3(n) + real :: 
r1(10) + complex :: c1(3, 4) + logical :: l1(10) + character(8) :: s1(3) + character(kind=2, len=5) :: s2(3) + + !$omp parallel private(a, i1, i2, i3, r1, c1, l1, s1, s2) + !$omp single + !$omp end single copyprivate(a, i1, i2, i3, r1, c1, l1, s1, s2) + !$omp end parallel +end subroutine + +!CHECK-LABEL: func @_QPtest_dt +!CHECK: omp.parallel +!CHECK: %[[T:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_dtEt"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +!CHECK: omp.single copyprivate(%[[T]]#0 -> @_copy_rec__QFtest_dtTdt : !fir.ref>) +subroutine test_dt() + type dt + integer :: i + real :: r + end type + type(dt) :: t + + !$omp parallel private(t) + !$omp single + !$omp end single copyprivate(t) + !$omp end parallel +end subroutine + +!CHECK-LABEL: func @_QPtest_attr +!CHECK: omp.parallel +!CHECK: %[[I1:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_attrEi1"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +!CHECK: %[[I2:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_attrEi2"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +!CHECK: %[[I3:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_attrEi3"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +!CHECK: %[[R1:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_attrEr1"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +!CHECK: %[[C1:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_attrEc1"} : (!fir.ref>>>>) -> (!fir.ref>>>>, !fir.ref>>>>) +!CHECK: %[[C2:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_attrEc2"} : (!fir.ref>>>>) -> (!fir.ref>>>>, !fir.ref>>>>) +!CHECK: omp.single copyprivate(%[[I1]]#0 -> @_copy_box_heap_Uxi32 : !fir.ref>>>, %[[I2:.*]]#0 -> @_copy_box_heap_i32 : !fir.ref>>, %[[I3]]#0 -> @_copy_box_ptr_i32 : !fir.ref>>, %[[R1]]#0 -> @_copy_box_ptr_Uxf32 : !fir.ref>>>, %[[C1]]#0 -> @_copy_box_heap_Uxc8x5 
: !fir.ref>>>>, %[[C2]]#0 -> @_copy_box_ptr_Uxc8x9 : !fir.ref>>>>) +subroutine test_attr() + integer, allocatable :: i1(:) + integer, allocatable :: i2 + integer, pointer :: i3 + real, pointer :: r1(:) + character(kind=1, len=5), allocatable :: c1(:) + character(kind=1, len=9), pointer :: c2(:) + + !$omp parallel private(i1, i2, i3, r1, c1, c2) + !$omp single + !$omp end single copyprivate(i1, i2, i3, r1, c1, c2) + !$omp end parallel +end subroutine From 579ae446375b2bec6d329b612adfa0a74f7126e3 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Wed, 21 Feb 2024 10:01:58 -0800 Subject: [PATCH 130/351] [InstallAPI] Add missing clangBasic library dependency Appeases CI: https://lab.llvm.org/buildbot/#/builders/268/builds/8581/steps/5/logs/stdio --- clang/tools/clang-installapi/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt index c8dd56db101da..0ff78ce2fcead 100644 --- a/clang/tools/clang-installapi/CMakeLists.txt +++ b/clang/tools/clang-installapi/CMakeLists.txt @@ -14,6 +14,7 @@ add_clang_tool(clang-installapi clang_target_link_libraries(clang-installapi PRIVATE clangInstallAPI + clangBasic clangDriver clangFrontend clangTooling From e7bfe414a6abee31a8c83afbc8206e6249dd837d Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Wed, 21 Feb 2024 10:09:55 -0800 Subject: [PATCH 131/351] [InstallAPI] Add additional missing library dependency Fixes: https://lab.llvm.org/buildbot/#/builders/268/builds/8583 --- clang/tools/clang-installapi/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt index 0ff78ce2fcead..f0fe0b22aaa20 100644 --- a/clang/tools/clang-installapi/CMakeLists.txt +++ b/clang/tools/clang-installapi/CMakeLists.txt @@ -18,4 +18,5 @@ clang_target_link_libraries(clang-installapi clangDriver clangFrontend clangTooling + clangSerialization ) From 
c5bbf979ada59e4ef9e67cb8bae59522d17b5140 Mon Sep 17 00:00:00 2001 From: Nick Anderson Date: Wed, 21 Feb 2024 10:12:03 -0800 Subject: [PATCH 132/351] [AMDGPU] fixes mistake in #82018 (#82223) fixes #81766 #82018 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5e1d750850374..257dff6ef6839 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6306,7 +6306,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || - VT == MVT::v32f16) + VT == MVT::v16bf16) return splitBinaryVectorOp(Op, DAG); return Op; } From 2167881f5154823dc3183845700add7df15fc856 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 21 Feb 2024 10:13:25 -0800 Subject: [PATCH 133/351] [ARM,MC] Support FDPIC relocations Linux kernel fs/binfmt_elf_fdpic.c supports FDPIC for MMU-less systems. GCC/binutils/qemu support FDPIC ABI for ARM (https://github.com/mickael-guene/fdpic_doc). _ARM FDPIC Toolchain and ABI_ provides a summary. This patch implements FDPIC relocations to the integrated assembler. There are 6 static relocations and 2 dynamic relocations, with R_ARM_FUNCDESC as both static and dynamic. gas requires `--fdpic` to assemble data relocations like `.word f(FUNCDESC)`. This patch adds `MCTargetOptions::FDPIC` and reports an error if FDPIC is not set. 
Pull Request: https://github.com/llvm/llvm-project/pull/82187 --- llvm/include/llvm/BinaryFormat/ELF.h | 1 + .../llvm/BinaryFormat/ELFRelocs/ARM.def | 7 ++++ llvm/include/llvm/MC/MCExpr.h | 6 ++++ .../llvm/MC/MCParser/MCTargetAsmParser.h | 4 +++ llvm/include/llvm/MC/MCTargetOptions.h | 1 + .../llvm/MC/MCTargetOptionsCommandFlags.h | 2 ++ llvm/lib/MC/MCExpr.cpp | 15 ++++---- llvm/lib/MC/MCParser/AsmParser.cpp | 2 +- llvm/lib/MC/MCTargetOptions.cpp | 2 +- llvm/lib/MC/MCTargetOptionsCommandFlags.cpp | 5 +++ llvm/lib/ObjectYAML/ELFYAML.cpp | 1 + .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 34 +++++++++++++++++++ .../Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 5 ++- .../ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 21 ++++++++++++ .../Target/ARM/MCTargetDesc/CMakeLists.txt | 1 + llvm/test/MC/ARM/fdpic.s | 33 ++++++++++++++++++ .../llvm-readobj/ELF/file-header-os-abi.test | 7 ++++ .../llvm-readobj/ELF/reloc-types-arm.test | 14 ++++++++ llvm/tools/llvm-readobj/ELFDumper.cpp | 3 +- 19 files changed, 153 insertions(+), 11 deletions(-) create mode 100644 llvm/test/MC/ARM/fdpic.s diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 124bba76c1774..bace3a92677a8 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -362,6 +362,7 @@ enum { ELFOSABI_AMDGPU_PAL = 65, // AMD PAL runtime ELFOSABI_AMDGPU_MESA3D = 66, // AMD GCN GPUs (GFX6+) for MESA runtime ELFOSABI_ARM = 97, // ARM + ELFOSABI_ARM_FDPIC = 65, // ARM FDPIC ELFOSABI_C6000_ELFABI = 64, // Bare-metal TMS320C6000 ELFOSABI_C6000_LINUX = 65, // Linux TMS320C6000 ELFOSABI_STANDALONE = 255, // Standalone (embedded) application diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/ARM.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/ARM.def index 47084d1eb0aad..7e9fe965241f2 100644 --- a/llvm/include/llvm/BinaryFormat/ELFRelocs/ARM.def +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/ARM.def @@ -143,3 +143,10 @@ ELF_RELOC(R_ARM_THM_BF16, 0x88) 
ELF_RELOC(R_ARM_THM_BF12, 0x89) ELF_RELOC(R_ARM_THM_BF18, 0x8a) ELF_RELOC(R_ARM_IRELATIVE, 0xa0) +ELF_RELOC(R_ARM_GOTFUNCDESC, 0xa1) +ELF_RELOC(R_ARM_GOTOFFFUNCDESC, 0xa2) +ELF_RELOC(R_ARM_FUNCDESC, 0xa3) +ELF_RELOC(R_ARM_FUNCDESC_VALUE, 0xa4) +ELF_RELOC(R_ARM_TLS_GD32_FDPIC, 0xa5) +ELF_RELOC(R_ARM_TLS_LDM32_FDPIC, 0xa6) +ELF_RELOC(R_ARM_TLS_IE32_FDPIC, 0xa7) diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index 67836292874f5..b311960937204 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -223,6 +223,12 @@ class MCSymbolRefExpr : public MCExpr { VK_SECREL, VK_SIZE, // symbol@SIZE VK_WEAKREF, // The link between the symbols in .weakref foo, bar + VK_FUNCDESC, + VK_GOTFUNCDESC, + VK_GOTOFFFUNCDESC, + VK_TLSGD_FDPIC, + VK_TLSLDM_FDPIC, + VK_GOTTPOFF_FDPIC, VK_X86_ABS8, VK_X86_PLTOFF, diff --git a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h index fe905f2c3ba5f..7edd3f8ce4904 100644 --- a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h @@ -525,6 +525,10 @@ class MCTargetAsmParser : public MCAsmParserExtension { // Return whether this parser accept star as start of statement virtual bool starIsStartOfStatement() { return false; }; + virtual MCSymbolRefExpr::VariantKind + getVariantKindForName(StringRef Name) const { + return MCSymbolRefExpr::getVariantKindForName(Name); + } virtual const MCExpr *applyModifierToExpr(const MCExpr *E, MCSymbolRefExpr::VariantKind, MCContext &Ctx) { diff --git a/llvm/include/llvm/MC/MCTargetOptions.h b/llvm/include/llvm/MC/MCTargetOptions.h index e2dd1e0433dbe..a7295879e15f0 100644 --- a/llvm/include/llvm/MC/MCTargetOptions.h +++ b/llvm/include/llvm/MC/MCTargetOptions.h @@ -51,6 +51,7 @@ class MCTargetOptions { bool MCNoTypeCheck : 1; bool MCSaveTempLabels : 1; bool MCIncrementalLinkerCompatible : 1; + bool FDPIC : 1; bool ShowMCEncoding : 1; bool ShowMCInst : 1; bool 
AsmVerbose : 1; diff --git a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h index 7f6ee6c8be224..ba3784cab5b11 100644 --- a/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h +++ b/llvm/include/llvm/MC/MCTargetOptionsCommandFlags.h @@ -29,6 +29,8 @@ std::optional getExplicitRelaxAll(); bool getIncrementalLinkerCompatible(); +bool getFDPIC(); + int getDwarfVersion(); bool getDwarf64(); diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 80def6dfc24b1..485fd1885ddb5 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -220,6 +220,7 @@ const MCSymbolRefExpr *MCSymbolRefExpr::create(StringRef Name, VariantKind Kind, StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { switch (Kind) { + // clang-format off case VK_Invalid: return "<>"; case VK_None: return "<>"; @@ -232,13 +233,16 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_GOTPCREL: return "GOTPCREL"; case VK_GOTPCREL_NORELAX: return "GOTPCREL_NORELAX"; case VK_GOTTPOFF: return "GOTTPOFF"; + case VK_GOTTPOFF_FDPIC: return "gottpoff_fdpic"; case VK_INDNTPOFF: return "INDNTPOFF"; case VK_NTPOFF: return "NTPOFF"; case VK_GOTNTPOFF: return "GOTNTPOFF"; case VK_PLT: return "PLT"; case VK_TLSGD: return "TLSGD"; + case VK_TLSGD_FDPIC: return "tlsgd_fdpic"; case VK_TLSLD: return "TLSLD"; case VK_TLSLDM: return "TLSLDM"; + case VK_TLSLDM_FDPIC: return "tlsldm_fdpic"; case VK_TPOFF: return "TPOFF"; case VK_TPREL: return "TPREL"; case VK_TLSCALL: return "tlscall"; @@ -253,6 +257,9 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_SECREL: return "SECREL32"; case VK_SIZE: return "SIZE"; case VK_WEAKREF: return "WEAKREF"; + case VK_FUNCDESC: return "FUNCDESC"; + case VK_GOTFUNCDESC: return "GOTFUNCDESC"; + case VK_GOTOFFFUNCDESC: return "GOTOFFFUNCDESC"; case VK_X86_ABS8: return "ABS8"; case VK_X86_PLTOFF: return "PLTOFF"; case VK_ARM_NONE: return "none"; @@ -386,6 +393,7 @@ 
StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_VE_TLS_GD_LO32: return "tls_gd_lo"; case VK_VE_TPOFF_HI32: return "tpoff_hi"; case VK_VE_TPOFF_LO32: return "tpoff_lo"; + // clang-format on } llvm_unreachable("Invalid variant kind"); } @@ -493,13 +501,6 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) { .Case("ie", VK_Hexagon_IE) .Case("ldgot", VK_Hexagon_LD_GOT) .Case("ldplt", VK_Hexagon_LD_PLT) - .Case("none", VK_ARM_NONE) - .Case("got_prel", VK_ARM_GOT_PREL) - .Case("target1", VK_ARM_TARGET1) - .Case("target2", VK_ARM_TARGET2) - .Case("prel31", VK_ARM_PREL31) - .Case("sbrel", VK_ARM_SBREL) - .Case("tlsldo", VK_ARM_TLSLDO) .Case("lo8", VK_AVR_LO8) .Case("hi8", VK_AVR_HI8) .Case("hlo8", VK_AVR_HLO8) diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 8e508dbdb1c69..a1c32eee32864 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -1237,7 +1237,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, // Lookup the symbol variant if used. 
if (!Split.second.empty()) { - Variant = MCSymbolRefExpr::getVariantKindForName(Split.second); + Variant = getTargetParser().getVariantKindForName(Split.second); if (Variant != MCSymbolRefExpr::VK_Invalid) { SymbolName = Split.first; } else if (MAI.doesAllowAtInName() && !MAI.useParensForSymbolVariant()) { diff --git a/llvm/lib/MC/MCTargetOptions.cpp b/llvm/lib/MC/MCTargetOptions.cpp index 07c6e752cb613..bff4b8da2fb1b 100644 --- a/llvm/lib/MC/MCTargetOptions.cpp +++ b/llvm/lib/MC/MCTargetOptions.cpp @@ -15,7 +15,7 @@ MCTargetOptions::MCTargetOptions() : MCRelaxAll(false), MCNoExecStack(false), MCFatalWarnings(false), MCNoWarn(false), MCNoDeprecatedWarn(false), MCNoTypeCheck(false), MCSaveTempLabels(false), MCIncrementalLinkerCompatible(false), - ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false), + FDPIC(false), ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false), PreserveAsmComments(true), Dwarf64(false), EmitDwarfUnwind(EmitDwarfUnwindType::Default), MCUseDwarfDirectory(DefaultDwarfDirectory), diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 8a4923e4792fb..fb8334d626cb8 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -36,6 +36,7 @@ using namespace llvm; MCOPT_EXP(bool, RelaxAll) MCOPT(bool, IncrementalLinkerCompatible) +MCOPT(bool, FDPIC) MCOPT(int, DwarfVersion) MCOPT(bool, Dwarf64) MCOPT(EmitDwarfUnwindType, EmitDwarfUnwind) @@ -66,6 +67,9 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { "emit an object file which can be used with an incremental linker")); MCBINDOPT(IncrementalLinkerCompatible); + static cl::opt FDPIC("fdpic", cl::desc("Use the FDPIC ABI")); + MCBINDOPT(FDPIC); + static cl::opt DwarfVersion("dwarf-version", cl::desc("Dwarf version"), cl::init(0)); MCBINDOPT(DwarfVersion); @@ -135,6 +139,7 @@ MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() { MCTargetOptions Options; 
Options.MCRelaxAll = getRelaxAll(); Options.MCIncrementalLinkerCompatible = getIncrementalLinkerCompatible(); + Options.FDPIC = getFDPIC(); Options.Dwarf64 = getDwarf64(); Options.DwarfVersion = getDwarfVersion(); Options.ShowMCInst = getShowMCInst(); diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index de1ef2458152c..9c1a28db592a1 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -406,6 +406,7 @@ void ScalarEnumerationTraits::enumeration( ECase(ELFOSABI_AMDGPU_PAL); ECase(ELFOSABI_AMDGPU_MESA3D); ECase(ELFOSABI_ARM); + ECase(ELFOSABI_ARM_FDPIC); ECase(ELFOSABI_C6000_ELFABI); ECase(ELFOSABI_C6000_LINUX); ECase(ELFOSABI_STANDALONE); diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index c82ab57bdf80f..37bfb76a494de 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -737,6 +737,9 @@ class ARMAsmParser : public MCTargetAsmParser { void ReportNearMisses(SmallVectorImpl &NearMisses, SMLoc IDLoc, OperandVector &Operands); + MCSymbolRefExpr::VariantKind + getVariantKindForName(StringRef Name) const override; + void doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) override; void onLabelParsed(MCSymbol *Symbol) override; @@ -11358,6 +11361,37 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) { return false; } +MCSymbolRefExpr::VariantKind +ARMAsmParser::getVariantKindForName(StringRef Name) const { + return StringSwitch(Name.lower()) + .Case("funcdesc", MCSymbolRefExpr::VK_FUNCDESC) + .Case("got", MCSymbolRefExpr::VK_GOT) + .Case("got_prel", MCSymbolRefExpr::VK_ARM_GOT_PREL) + .Case("gotfuncdesc", MCSymbolRefExpr::VK_GOTFUNCDESC) + .Case("gotoff", MCSymbolRefExpr::VK_GOTOFF) + .Case("gotofffuncdesc", MCSymbolRefExpr::VK_GOTOFFFUNCDESC) + .Case("gottpoff", MCSymbolRefExpr::VK_GOTTPOFF) + .Case("gottpoff_fdpic", MCSymbolRefExpr::VK_GOTTPOFF_FDPIC) + .Case("imgrel", 
MCSymbolRefExpr::VK_COFF_IMGREL32) + .Case("none", MCSymbolRefExpr::VK_ARM_NONE) + .Case("plt", MCSymbolRefExpr::VK_PLT) + .Case("prel31", MCSymbolRefExpr::VK_ARM_PREL31) + .Case("sbrel", MCSymbolRefExpr::VK_ARM_SBREL) + .Case("secrel32", MCSymbolRefExpr::VK_SECREL) + .Case("target1", MCSymbolRefExpr::VK_ARM_TARGET1) + .Case("target2", MCSymbolRefExpr::VK_ARM_TARGET2) + .Case("tlscall", MCSymbolRefExpr::VK_TLSCALL) + .Case("tlsdesc", MCSymbolRefExpr::VK_TLSDESC) + .Case("tlsgd", MCSymbolRefExpr::VK_TLSGD) + .Case("tlsgd_fdpic", MCSymbolRefExpr::VK_TLSGD_FDPIC) + .Case("tlsld", MCSymbolRefExpr::VK_TLSLD) + .Case("tlsldm", MCSymbolRefExpr::VK_TLSLDM) + .Case("tlsldm_fdpic", MCSymbolRefExpr::VK_TLSLDM_FDPIC) + .Case("tlsldo", MCSymbolRefExpr::VK_ARM_TLSLDO) + .Case("tpoff", MCSymbolRefExpr::VK_TPOFF) + .Default(MCSymbolRefExpr::VK_Invalid); +} + void ARMAsmParser::doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) { // We need to flush the current implicit IT block on a label, because it is // not legal to branch into an IT block. diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1d17bb349f24b..6cd4badb7704b 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/Debug.h" #include "llvm/Support/EndianStream.h" @@ -1349,7 +1350,9 @@ static MCAsmBackend *createARMAsmBackend(const Target &T, return new ARMAsmBackendWinCOFF(T, STI.getTargetTriple().isThumb()); case Triple::ELF: assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target"); - uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + uint8_t OSABI = Options.FDPIC + ? 
ELF::ELFOSABI_ARM_FDPIC + : MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); return new ARMAsmBackendELF(T, STI.getTargetTriple().isThumb(), OSABI, Endian); } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 44695a86c4e36..de7449a400a74 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" +#include "llvm/Object/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include @@ -84,6 +85,14 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + auto CheckFDPIC = [&](uint32_t Type) { + if (getOSABI() != ELF::ELFOSABI_ARM_FDPIC) + Ctx.reportError(Fixup.getLoc(), + "relocation " + + object::getELFRelocationTypeName(ELF::EM_ARM, Type) + + " only supported in FDPIC mode"); + return Type; + }; if (IsPCRel) { switch (Fixup.getTargetKind()) { @@ -240,6 +249,18 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, return ELF::R_ARM_TLS_LDM32; case MCSymbolRefExpr::VK_ARM_TLSDESCSEQ: return ELF::R_ARM_TLS_DESCSEQ; + case MCSymbolRefExpr::VK_FUNCDESC: + return CheckFDPIC(ELF::R_ARM_FUNCDESC); + case MCSymbolRefExpr::VK_GOTFUNCDESC: + return CheckFDPIC(ELF::R_ARM_GOTFUNCDESC); + case MCSymbolRefExpr::VK_GOTOFFFUNCDESC: + return CheckFDPIC(ELF::R_ARM_GOTOFFFUNCDESC); + case MCSymbolRefExpr::VK_TLSGD_FDPIC: + return CheckFDPIC(ELF::R_ARM_TLS_GD32_FDPIC); + case MCSymbolRefExpr::VK_TLSLDM_FDPIC: + return CheckFDPIC(ELF::R_ARM_TLS_LDM32_FDPIC); + case MCSymbolRefExpr::VK_GOTTPOFF_FDPIC: + return CheckFDPIC(ELF::R_ARM_TLS_IE32_FDPIC); } case ARM::fixup_arm_condbranch: case 
ARM::fixup_arm_uncondbranch: diff --git a/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt index 687d9a9e918db..8b3ef0ee651e5 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt +++ b/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_component_library(LLVMARMDesc CodeGenTypes MC MCDisassembler + Object Support TargetParser diff --git a/llvm/test/MC/ARM/fdpic.s b/llvm/test/MC/ARM/fdpic.s new file mode 100644 index 0000000000000..1dfedc59ada61 --- /dev/null +++ b/llvm/test/MC/ARM/fdpic.s @@ -0,0 +1,33 @@ +# RUN: llvm-mc -triple=armv7-linux-gnueabi %s | FileCheck %s --check-prefix=ASM +# RUN: llvm-mc -filetype=obj -triple=armv7-linux-gnueabi --fdpic %s | llvm-readelf -h -r - | FileCheck %s + +# RUN: not llvm-mc -filetype=obj -triple=armv7-linux-gnueabi %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR + +# ASM: .long f(FUNCDESC) +# ASM-NEXT: .long f(GOTFUNCDESC) +# ASM-NEXT: .long f(GOTOFFFUNCDESC) + +# CHECK: OS/ABI: ARM FDPIC +# CHECK: Machine: ARM +# CHECK: Flags: 0x5000000 + +# CHECK: R_ARM_FUNCDESC 00000000 f +# CHECK-NEXT: R_ARM_GOTFUNCDESC 00000000 f +# CHECK-NEXT: R_ARM_GOTOFFFUNCDESC 00000000 f +# CHECK-NEXT: R_ARM_TLS_GD32_FDPIC 00000000 tls +# CHECK-NEXT: R_ARM_TLS_LDM32_FDPIC 00000000 tls +# CHECK-NEXT: R_ARM_TLS_IE32_FDPIC 00000000 tls + +.data +# ERR: [[#@LINE+1]]:7: error: relocation R_ARM_FUNCDESC only supported in FDPIC mode +.long f(FUNCDESC) +# ERR: [[#@LINE+1]]:7: error: relocation R_ARM_GOTFUNCDESC only supported in FDPIC mode +.long f(GOTFUNCDESC) +# ERR: [[#@LINE+1]]:7: error: relocation R_ARM_GOTOFFFUNCDESC only supported in FDPIC mode +.long f(GOTOFFFUNCDESC) +# ERR: [[#@LINE+1]]:7: error: relocation R_ARM_TLS_GD32_FDPIC only supported in FDPIC mode +.long tls(tlsgd_fdpic) +# ERR: [[#@LINE+1]]:7: error: relocation R_ARM_TLS_LDM32_FDPIC only supported in FDPIC mode +.long tls(tlsldm_fdpic) +# ERR: [[#@LINE+1]]:7: error: relocation R_ARM_TLS_IE32_FDPIC only 
supported in FDPIC mode +.long tls(gottpoff_fdpic) diff --git a/llvm/test/tools/llvm-readobj/ELF/file-header-os-abi.test b/llvm/test/tools/llvm-readobj/ELF/file-header-os-abi.test index eb60d2a021af6..a48346d6b9c85 100644 --- a/llvm/test/tools/llvm-readobj/ELF/file-header-os-abi.test +++ b/llvm/test/tools/llvm-readobj/ELF/file-header-os-abi.test @@ -192,6 +192,13 @@ FileHeader: # OSABI-ARM-LLVM: OS/ABI: ARM (0x61) # OSABI-ARM-GNU: OS/ABI: ARM +# RUN: yaml2obj %s -DOSABI=ELFOSABI_ARM_FDPIC -DMACHINE=EM_ARM -o %t.osabi.arm_fdpic +# RUN: llvm-readobj --file-headers %t.osabi.arm_fdpic | FileCheck %s --match-full-lines --check-prefix=OSABI-ARMFDPIC-LLVM +# RUN: llvm-readelf --file-headers %t.osabi.arm_fdpic | FileCheck %s --match-full-lines --check-prefix=OSABI-ARMFDPIC-GNU + +# OSABI-ARMFDPIC-LLVM: OS/ABI: ARM FDPIC (0x41) +# OSABI-ARMFDPIC-GNU: OS/ABI: ARM FDPIC + ## Check all EM_TI_C6000 specific values. # RUN: yaml2obj %s -DOSABI=ELFOSABI_C6000_ELFABI -DMACHINE=EM_TI_C6000 -o %t.osabi.c6000.elfabi diff --git a/llvm/test/tools/llvm-readobj/ELF/reloc-types-arm.test b/llvm/test/tools/llvm-readobj/ELF/reloc-types-arm.test index 96d6cfed4df3e..dafe01ba36afb 100644 --- a/llvm/test/tools/llvm-readobj/ELF/reloc-types-arm.test +++ b/llvm/test/tools/llvm-readobj/ELF/reloc-types-arm.test @@ -135,6 +135,13 @@ # CHECK: Type: R_ARM_THM_TLS_DESCSEQ16 (129) # CHECK: Type: R_ARM_THM_TLS_DESCSEQ32 (130) # CHECK: Type: R_ARM_IRELATIVE (160) +# CHECK: Type: R_ARM_GOTFUNCDESC (161) +# CHECK: Type: R_ARM_GOTOFFFUNCDESC (162) +# CHECK: Type: R_ARM_FUNCDESC (163) +# CHECK: Type: R_ARM_FUNCDESC_VALUE (164) +# CHECK: Type: R_ARM_TLS_GD32_FDPIC (165) +# CHECK: Type: R_ARM_TLS_LDM32_FDPIC (166) +# CHECK: Type: R_ARM_TLS_IE32_FDPIC (167) --- !ELF FileHeader: @@ -278,3 +285,10 @@ Sections: - Type: R_ARM_THM_TLS_DESCSEQ16 - Type: R_ARM_THM_TLS_DESCSEQ32 - Type: R_ARM_IRELATIVE + - Type: R_ARM_GOTFUNCDESC + - Type: R_ARM_GOTOFFFUNCDESC + - Type: R_ARM_FUNCDESC + - Type: R_ARM_FUNCDESC_VALUE + - 
Type: R_ARM_TLS_GD32_FDPIC + - Type: R_ARM_TLS_LDM32_FDPIC + - Type: R_ARM_TLS_IE32_FDPIC diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index f937a074649a1..4be678df44125 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1095,7 +1095,8 @@ const EnumEntry AMDGPUElfOSABI[] = { }; const EnumEntry ARMElfOSABI[] = { - {"ARM", "ARM", ELF::ELFOSABI_ARM} + {"ARM", "ARM", ELF::ELFOSABI_ARM}, + {"ARM FDPIC", "ARM FDPIC", ELF::ELFOSABI_ARM_FDPIC}, }; const EnumEntry C6000ElfOSABI[] = { From 5488e3ea4869b32e82f7f7dfb3ba22cb7b5b9436 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Wed, 21 Feb 2024 10:20:52 -0800 Subject: [PATCH 134/351] [InstallAPI] add explicit dependency on llvmOption library --- clang/tools/clang-installapi/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt index f0fe0b22aaa20..b8384c92c104f 100644 --- a/clang/tools/clang-installapi/CMakeLists.txt +++ b/clang/tools/clang-installapi/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS Support TargetParser TextAPI + Option ) add_clang_tool(clang-installapi From 6f0e39c4239bb4c0980a048e264f61bded147388 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Wed, 21 Feb 2024 18:17:40 +0000 Subject: [PATCH 135/351] [NVPTX] Set ISD::FP_{EXTEND,ROUND} to Custom for more types Sometimes those nodes are queried with the non-bf16. We need to request to SDAG that we want to handle the non-bf16 side so that the handler can detect if bf16 is being used on either side. 
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ef3b61fbd0dea..63920a1db25c8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -780,8 +780,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand); } if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { - setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); - setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom); + for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) { + setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Custom); + } setOperationAction(ISD::BF16_TO_FP, MVT::f64, Custom); } From 966b026785a09ec079e8b0ba79358892fcb958ad Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Wed, 21 Feb 2024 18:32:32 +0000 Subject: [PATCH 136/351] [NVPTX] Simplify handling of ISD::BF16_TO_FP We only use it to get from BF16 to F32. After that point, we insert an FP_EXTEND to get the rest of the way. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 63920a1db25c8..fc6c642acbc07 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -784,7 +784,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::FP_ROUND, VT, Custom); } - setOperationAction(ISD::BF16_TO_FP, MVT::f64, Custom); } // sm_80 only has conversions between f32 and bf16. 
Custom lower all other @@ -2526,15 +2525,13 @@ SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op, (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32) : MVT::f32; - EVT F64 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f64) - : MVT::f64; SDLoc Loc(Op); if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) { Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow); } else { Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow); } - return DAG.getNode(ISD::FP_EXTEND, Loc, F64, Op); + return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op); } } From 4247175d4536964322b129d1d3bbe6128da653bf Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 21 Feb 2024 10:55:59 -0800 Subject: [PATCH 137/351] [nfc]For InstrProfData.inc, clang-format functions and opt-out of formatting on the rest (#82057) Without this, each time `InstrProfData.inc` is modified (like in https://github.com/llvm/llvm-project/pull/81691), pre-commit CI clang-format aggressively formats many lines in an unreadable way. Pull request with red pre-commit checks are usually frowned upon. * Use `// clang-format:` instead of `/* clang-format */`. The former [allows](https://github.com/llvm/llvm-project/blob/563ef306017a47d387f1c36dd562b172c1ad0626/clang/lib/Format/Format.cpp#L4108-L4113) specifying a reason but the latter is [not](https://github.com/llvm/llvm-project/blob/563ef306017a47d387f1c36dd562b172c1ad0626/clang/lib/Format/Format.cpp#L4105-L4106). - Filed https://github.com/llvm/llvm-project/issues/82426 to track the issue in clang-format. 
--- compiler-rt/include/profile/InstrProfData.inc | 33 +++++++++++-------- .../llvm/ProfileData/InstrProfData.inc | 33 +++++++++++-------- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 25df899b3f361..c907a9736f316 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -62,6 +62,8 @@ #define INSTR_PROF_VISIBILITY #endif +// clang-format off:consider re-enabling clang-format if auto-formatted C macros +// are readable (e.g., after `issue #82426` is fixed) /* INSTR_PROF_DATA start. */ /* Definition of member fields of the per-function control structure. */ #ifndef INSTR_PROF_DATA @@ -494,12 +496,14 @@ getValueProfRecordHeaderSize(uint32_t NumValueSites); #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #endif +// clang-format on + /*! * Return the \c ValueProfRecord header size including the * padding bytes. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -uint32_t getValueProfRecordHeaderSize(uint32_t NumValueSites) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint32_t +getValueProfRecordHeaderSize(uint32_t NumValueSites) { uint32_t Size = offsetof(ValueProfRecord, SiteCountArray) + sizeof(uint8_t) * NumValueSites; /* Round the size to multiple of 8 bytes. */ @@ -511,9 +515,8 @@ uint32_t getValueProfRecordHeaderSize(uint32_t NumValueSites) { * Return the total size of the value profile record including the * header and the value data. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -uint32_t getValueProfRecordSize(uint32_t NumValueSites, - uint32_t NumValueData) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint32_t +getValueProfRecordSize(uint32_t NumValueSites, uint32_t NumValueData) { return getValueProfRecordHeaderSize(NumValueSites) + sizeof(InstrProfValueData) * NumValueData; } @@ -521,8 +524,8 @@ uint32_t getValueProfRecordSize(uint32_t NumValueSites, /*! 
* Return the pointer to the start of value data array. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -InstrProfValueData *getValueProfRecordValueData(ValueProfRecord *This) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE InstrProfValueData * +getValueProfRecordValueData(ValueProfRecord *This) { return (InstrProfValueData *)((char *)This + getValueProfRecordHeaderSize( This->NumValueSites)); } @@ -530,8 +533,8 @@ InstrProfValueData *getValueProfRecordValueData(ValueProfRecord *This) { /*! * Return the total number of value data for \c This record. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -uint32_t getValueProfRecordNumValueData(ValueProfRecord *This) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint32_t +getValueProfRecordNumValueData(ValueProfRecord *This) { uint32_t NumValueData = 0; uint32_t I; for (I = 0; I < This->NumValueSites; I++) @@ -542,8 +545,8 @@ uint32_t getValueProfRecordNumValueData(ValueProfRecord *This) { /*! * Use this method to advance to the next \c This \c ValueProfRecord. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -ValueProfRecord *getValueProfRecordNext(ValueProfRecord *This) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE ValueProfRecord * +getValueProfRecordNext(ValueProfRecord *This) { uint32_t NumValueData = getValueProfRecordNumValueData(This); return (ValueProfRecord *)((char *)This + getValueProfRecordSize(This->NumValueSites, @@ -553,8 +556,8 @@ ValueProfRecord *getValueProfRecordNext(ValueProfRecord *This) { /*! * Return the first \c ValueProfRecord instance. 
*/ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -ValueProfRecord *getFirstValueProfRecord(ValueProfData *This) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE ValueProfRecord * +getFirstValueProfRecord(ValueProfData *This) { return (ValueProfRecord *)((char *)This + sizeof(ValueProfData)); } @@ -637,6 +640,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /*============================================================================*/ +// clang-format off:consider re-enabling clang-format if auto-formatted C macros +// are readable (e.g., after `issue #82426` is fixed) #ifndef INSTR_PROF_DATA_DEFINED #ifndef INSTR_PROF_DATA_INC @@ -903,6 +908,8 @@ int InstProfPopcountll(unsigned long long X) { return __builtin_popcountll(X); } #endif /* defined(_MSC_VER) && !defined(__clang__) */ +// clang-format on + /* Map an (observed) memop size value to the representative value of its range. * For example, 5 -> 5, 22 -> 17, 99 -> 65, 256 -> 256, 1001 -> 513. */ INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint64_t diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 25df899b3f361..c907a9736f316 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -62,6 +62,8 @@ #define INSTR_PROF_VISIBILITY #endif +// clang-format off:consider re-enabling clang-format if auto-formatted C macros +// are readable (e.g., after `issue #82426` is fixed) /* INSTR_PROF_DATA start. */ /* Definition of member fields of the per-function control structure. */ #ifndef INSTR_PROF_DATA @@ -494,12 +496,14 @@ getValueProfRecordHeaderSize(uint32_t NumValueSites); #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #endif +// clang-format on + /*! * Return the \c ValueProfRecord header size including the * padding bytes. 
*/ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -uint32_t getValueProfRecordHeaderSize(uint32_t NumValueSites) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint32_t +getValueProfRecordHeaderSize(uint32_t NumValueSites) { uint32_t Size = offsetof(ValueProfRecord, SiteCountArray) + sizeof(uint8_t) * NumValueSites; /* Round the size to multiple of 8 bytes. */ @@ -511,9 +515,8 @@ uint32_t getValueProfRecordHeaderSize(uint32_t NumValueSites) { * Return the total size of the value profile record including the * header and the value data. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -uint32_t getValueProfRecordSize(uint32_t NumValueSites, - uint32_t NumValueData) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint32_t +getValueProfRecordSize(uint32_t NumValueSites, uint32_t NumValueData) { return getValueProfRecordHeaderSize(NumValueSites) + sizeof(InstrProfValueData) * NumValueData; } @@ -521,8 +524,8 @@ uint32_t getValueProfRecordSize(uint32_t NumValueSites, /*! * Return the pointer to the start of value data array. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -InstrProfValueData *getValueProfRecordValueData(ValueProfRecord *This) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE InstrProfValueData * +getValueProfRecordValueData(ValueProfRecord *This) { return (InstrProfValueData *)((char *)This + getValueProfRecordHeaderSize( This->NumValueSites)); } @@ -530,8 +533,8 @@ InstrProfValueData *getValueProfRecordValueData(ValueProfRecord *This) { /*! * Return the total number of value data for \c This record. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -uint32_t getValueProfRecordNumValueData(ValueProfRecord *This) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint32_t +getValueProfRecordNumValueData(ValueProfRecord *This) { uint32_t NumValueData = 0; uint32_t I; for (I = 0; I < This->NumValueSites; I++) @@ -542,8 +545,8 @@ uint32_t getValueProfRecordNumValueData(ValueProfRecord *This) { /*! * Use this method to advance to the next \c This \c ValueProfRecord. 
*/ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -ValueProfRecord *getValueProfRecordNext(ValueProfRecord *This) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE ValueProfRecord * +getValueProfRecordNext(ValueProfRecord *This) { uint32_t NumValueData = getValueProfRecordNumValueData(This); return (ValueProfRecord *)((char *)This + getValueProfRecordSize(This->NumValueSites, @@ -553,8 +556,8 @@ ValueProfRecord *getValueProfRecordNext(ValueProfRecord *This) { /*! * Return the first \c ValueProfRecord instance. */ -INSTR_PROF_VISIBILITY INSTR_PROF_INLINE -ValueProfRecord *getFirstValueProfRecord(ValueProfData *This) { +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE ValueProfRecord * +getFirstValueProfRecord(ValueProfData *This) { return (ValueProfRecord *)((char *)This + sizeof(ValueProfData)); } @@ -637,6 +640,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /*============================================================================*/ +// clang-format off:consider re-enabling clang-format if auto-formatted C macros +// are readable (e.g., after `issue #82426` is fixed) #ifndef INSTR_PROF_DATA_DEFINED #ifndef INSTR_PROF_DATA_INC @@ -903,6 +908,8 @@ int InstProfPopcountll(unsigned long long X) { return __builtin_popcountll(X); } #endif /* defined(_MSC_VER) && !defined(__clang__) */ +// clang-format on + /* Map an (observed) memop size value to the representative value of its range. * For example, 5 -> 5, 22 -> 17, 99 -> 65, 256 -> 256, 1001 -> 513. */ INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint64_t From 9c2468821ec51defd09c246fea4a47886fff8c01 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 21 Feb 2024 13:02:30 -0600 Subject: [PATCH 138/351] [lldb][test] Modernize asserts (#82503) This uses [teyit](https://pypi.org/project/teyit/) to modernize asserts, as recommended by the [unittest release notes](https://docs.python.org/3.12/whatsnew/3.12.html#id3). For example, `assertTrue(a == b)` is replaced with `assertEqual(a, b)`. 
This produces better error messages, e.g. `error: unexpectedly found 1 and 2 to be different` instead of `error: False`. --- .../call-restarts/TestCallThatRestarts.py | 2 +- .../call-throws/TestCallThatThrows.py | 4 +- .../TestInvalidIteratorCompletionCrash.py | 2 +- .../commands/expression/fixits/TestFixIts.py | 8 +-- .../API/commands/expression/test/TestExprs.py | 2 +- .../unwind_expression/TestUnwindExpression.py | 5 +- .../TestSMEZRegistersSaveRestore.py | 2 +- .../TestZAThreadedDynamic.py | 2 +- .../TestSVEThreadedDynamic.py | 2 +- .../commands/session/save/TestSessionSave.py | 2 +- .../commands/statistics/basic/TestStats.py | 4 +- .../API/commands/trace/TestTraceExport.py | 2 +- lldb/test/API/commands/trace/TestTraceSave.py | 10 +-- .../TestTraceStartStopMultipleThreads.py | 6 +- .../archives/TestBSDArchives.py | 10 +-- .../functionalities/asan/TestMemoryHistory.py | 6 +- .../TestBreakpointCommand.py | 20 +++--- .../TestBreakpointCommandsFromPython.py | 4 +- .../TestBreakpointConditions.py | 4 +- .../TestRequireHWBreakpoints.py | 12 ++-- .../breakpoint/objc/TestObjCBreakpoints.py | 15 +++-- .../scripted_bkpt/TestScriptedResolver.py | 22 +++--- .../serialize/TestBreakpointSerialization.py | 2 +- .../TestStepOverBreakpoint.py | 2 +- .../TestDataFormatterPythonSynth.py | 24 ++++--- .../string/TestDataFormatterLibcxxString.py | 10 +-- .../TestDataFormatterLibcxxStringView.py | 10 +-- .../gdb_remote_client/TestMSP430MSPDebug.py | 11 +-- .../TestMultipleDebuggersCommands.py | 10 +-- .../TestProcessSaveCoreMinidump.py | 4 +- .../return-value/TestReturnValue.py | 5 +- .../scripted_process/TestScriptedProcess.py | 4 +- .../functionalities/signal/TestSendSignal.py | 4 +- .../signal/handle-abrt/TestHandleAbort.py | 4 +- .../signal/handle-segv/TestHandleSegv.py | 4 +- .../functionalities/signal/raise/TestRaise.py | 8 +-- .../step-avoids-no-debug/TestStepNoDebug.py | 5 +- .../TestStepAvoidsRegexp.py | 5 +- .../TestModuleLoadedNotifys.py | 5 +- 
.../break_after_join/TestBreakAfterJoin.py | 5 +- .../TestCreateDuringStep.py | 10 +-- .../exit_during_break/TestExitDuringBreak.py | 5 +- .../multi_break/TestMultipleBreakpoints.py | 5 +- .../thread/num_threads/TestNumThreads.py | 10 +-- .../thread_plan/TestThreadPlanCommands.py | 12 ++-- .../tsan/basic/TestTsanBasic.py | 2 +- .../tsan/multiple/TestTsanMultiple.py | 2 +- .../TestAArch64UnwindPAC.py | 4 +- .../TestRegisterVariables.py | 5 +- .../lang/c/stepping/TestStepAndBreakpoints.py | 7 +- .../cpp/dynamic-value/TestDynamicValue.py | 6 +- .../API/lang/cpp/namespace/TestNamespace.py | 15 +++-- lldb/test/API/lang/cpp/stl/TestSTL.py | 2 +- .../foundation/TestFoundationDisassembly.py | 4 +- .../objc-class-method/TestObjCClassMethod.py | 2 +- .../TestObjCStructArgument.py | 2 +- .../TestObjCStructReturn.py | 2 +- .../API/lang/objc/objc-super/TestObjCSuper.py | 2 +- .../objc/rdar-12408181/TestRdar12408181.py | 9 +-- .../TestObjCBuiltinTypes.py | 2 +- .../TestAArch64LinuxTaggedMemoryAccess.py | 4 +- .../aarch64/unwind_signal/TestUnwindSignal.py | 4 +- lldb/test/API/lua_api/TestLuaAPI.py | 2 +- .../function-starts/TestFunctionStarts.py | 6 +- .../TestObjCRecognizer.py | 4 +- .../TestDetachVrsProfile.py | 2 +- lldb/test/API/macosx/queues/TestQueues.py | 14 ++-- .../API/macosx/universal/TestUniversal.py | 6 +- lldb/test/API/python_api/event/TestEvents.py | 6 +- .../python_api/file_handle/TestFileHandle.py | 12 ++-- .../findvalue_duplist/TestSBFrameFindValue.py | 2 +- lldb/test/API/python_api/format/TestFormat.py | 2 +- .../formatters/TestFormattersSBAPI.py | 11 ++- .../frame/get-variables/TestGetVariables.py | 6 +- .../module_section/TestModuleAndSection.py | 6 +- .../API/python_api/process/TestProcessAPI.py | 2 +- .../python_api/process/io/TestProcessIO.py | 10 +-- lldb/test/API/python_api/sbdata/TestSBData.py | 67 ++++++++++--------- .../API/python_api/sbmodule/TestSBModule.py | 2 +- .../API/python_api/target/TestTargetAPI.py | 2 +- 
lldb/test/API/python_api/type/TestTypeList.py | 4 +- .../value/change_values/TestChangeValueAPI.py | 6 +- .../libcxx/atomic/TestChangeValue.py | 2 +- .../libcxx/map/TestChangeMapValue.py | 4 +- .../watchpoint/TestWatchpointIter.py | 2 +- .../API/source-manager/TestSourceManager.py | 2 +- .../lldb-dap/coreFile/TestDAP_coreFile.py | 2 +- .../disassemble/TestDAP_disassemble.py | 8 +-- .../runInTerminal/TestDAP_runInTerminal.py | 4 +- .../lldb-dap/stackTrace/TestDAP_stackTrace.py | 4 +- .../TestDAP_terminatedEvent.py | 8 +-- .../lldb-dap/variables/TestDAP_variables.py | 10 +-- .../tools/lldb-server/TestGdbRemoteAttach.py | 2 +- .../lldb-server/TestGdbRemoteAuxvSupport.py | 4 +- .../TestGdbRemoteExpeditedRegisters.py | 6 +- .../lldb-server/TestGdbRemoteRegisterState.py | 4 +- .../tools/lldb-server/TestLldbGdbServer.py | 12 ++-- .../attach-wait/TestGdbRemoteAttachWait.py | 2 +- .../TestGdbRemoteTargetXmlPacket.py | 2 +- 99 files changed, 335 insertions(+), 299 deletions(-) diff --git a/lldb/test/API/commands/expression/call-restarts/TestCallThatRestarts.py b/lldb/test/API/commands/expression/call-restarts/TestCallThatRestarts.py index 214d890db3fe3..ca08591aedb39 100644 --- a/lldb/test/API/commands/expression/call-restarts/TestCallThatRestarts.py +++ b/lldb/test/API/commands/expression/call-restarts/TestCallThatRestarts.py @@ -84,7 +84,7 @@ def call_function(self): handler_bkpt = target.BreakpointCreateBySourceRegex( "Got sigchld %d.", self.main_source_spec ) - self.assertTrue(handler_bkpt.GetNumLocations() > 0) + self.assertGreater(handler_bkpt.GetNumLocations(), 0) options.SetIgnoreBreakpoints(True) options.SetUnwindOnError(True) diff --git a/lldb/test/API/commands/expression/call-throws/TestCallThatThrows.py b/lldb/test/API/commands/expression/call-throws/TestCallThatThrows.py index 8d524ad9e9b6f..2868ec5ffdbdf 100644 --- a/lldb/test/API/commands/expression/call-throws/TestCallThatThrows.py +++ b/lldb/test/API/commands/expression/call-throws/TestCallThatThrows.py @@ 
-55,7 +55,7 @@ def call_function(self): handler_bkpt = target.BreakpointCreateBySourceRegex( "I felt like it", self.main_source_spec ) - self.assertTrue(handler_bkpt.GetNumLocations() > 0) + self.assertGreater(handler_bkpt.GetNumLocations(), 0) options.SetIgnoreBreakpoints(True) options.SetUnwindOnError(True) @@ -69,7 +69,7 @@ def call_function(self): exception_bkpt = target.BreakpointCreateForException( lldb.eLanguageTypeObjC, False, True ) - self.assertTrue(exception_bkpt.GetNumLocations() > 0) + self.assertGreater(exception_bkpt.GetNumLocations(), 0) options.SetIgnoreBreakpoints(True) options.SetUnwindOnError(True) diff --git a/lldb/test/API/commands/expression/completion-crash-invalid-iterator/TestInvalidIteratorCompletionCrash.py b/lldb/test/API/commands/expression/completion-crash-invalid-iterator/TestInvalidIteratorCompletionCrash.py index b031b63b4e267..4c7620a7e18db 100644 --- a/lldb/test/API/commands/expression/completion-crash-invalid-iterator/TestInvalidIteratorCompletionCrash.py +++ b/lldb/test/API/commands/expression/completion-crash-invalid-iterator/TestInvalidIteratorCompletionCrash.py @@ -11,7 +11,7 @@ def test(self): callee_break = target.BreakpointCreateByName( "SomeClass::SomeClass(ParamClass)", None ) - self.assertTrue(callee_break.GetNumLocations() > 0) + self.assertGreater(callee_break.GetNumLocations(), 0) self.runCmd("run", RUN_SUCCEEDED) to_complete = "e ParamClass" diff --git a/lldb/test/API/commands/expression/fixits/TestFixIts.py b/lldb/test/API/commands/expression/fixits/TestFixIts.py index 3289bc0c5c7e0..bc53b72fe611b 100644 --- a/lldb/test/API/commands/expression/fixits/TestFixIts.py +++ b/lldb/test/API/commands/expression/fixits/TestFixIts.py @@ -79,11 +79,11 @@ def test_with_target(self): self.assertTrue(value.IsValid()) self.assertTrue(value.GetError().Fail()) error_string = value.GetError().GetCString() - self.assertTrue( - error_string.find("fixed expression suggested:") != -1, "Fix was suggested" + self.assertNotEqual( + 
error_string.find("fixed expression suggested:"), -1, "Fix was suggested" ) - self.assertTrue( - error_string.find("my_pointer->second.a") != -1, "Fix was right" + self.assertNotEqual( + error_string.find("my_pointer->second.a"), -1, "Fix was right" ) def test_with_target_error_applies_fixit(self): diff --git a/lldb/test/API/commands/expression/test/TestExprs.py b/lldb/test/API/commands/expression/test/TestExprs.py index 0e3215522ea6e..41faf07f8cb44 100644 --- a/lldb/test/API/commands/expression/test/TestExprs.py +++ b/lldb/test/API/commands/expression/test/TestExprs.py @@ -163,7 +163,7 @@ def test_evaluate_expression_python(self): self.DebugSBValue(val) callee_break = target.BreakpointCreateByName("a_function_to_call", None) - self.assertTrue(callee_break.GetNumLocations() > 0) + self.assertGreater(callee_break.GetNumLocations(), 0) # Make sure ignoring breakpoints works from the command line: self.expect( diff --git a/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py b/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py index bb173c0584a46..82f062876a773 100644 --- a/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py +++ b/lldb/test/API/commands/expression/unwind_expression/TestUnwindExpression.py @@ -70,8 +70,9 @@ def do_unwind_test(self, thread, bkpt, timeout): self.assertTrue(val.GetError().Fail(), "We did not complete the execution.") error_str = val.GetError().GetCString() - self.assertTrue( - "Execution was interrupted, reason: breakpoint" in error_str, + self.assertIn( + "Execution was interrupted, reason: breakpoint", + error_str, "And the reason was right.", ) diff --git a/lldb/test/API/commands/register/register/aarch64_sme_z_registers/save_restore/TestSMEZRegistersSaveRestore.py b/lldb/test/API/commands/register/register/aarch64_sme_z_registers/save_restore/TestSMEZRegistersSaveRestore.py index 9433aae0c53c4..40e9c821bc64d 100644 --- 
a/lldb/test/API/commands/register/register/aarch64_sme_z_registers/save_restore/TestSMEZRegistersSaveRestore.py +++ b/lldb/test/API/commands/register/register/aarch64_sme_z_registers/save_restore/TestSMEZRegistersSaveRestore.py @@ -55,7 +55,7 @@ def get_supported_svg(self): # Write back the current vg to confirm read/write works at all. current_svg = self.match("register read vg", ["(0x[0-9]+)"]) - self.assertTrue(current_svg is not None) + self.assertIsNotNone(current_svg) self.expect("register write vg {}".format(current_svg.group())) # Aka 128, 256 and 512 bit. diff --git a/lldb/test/API/commands/register/register/aarch64_sme_z_registers/za_dynamic_resize/TestZAThreadedDynamic.py b/lldb/test/API/commands/register/register/aarch64_sme_z_registers/za_dynamic_resize/TestZAThreadedDynamic.py index 8b1d5908d96ca..1929c46264d7d 100644 --- a/lldb/test/API/commands/register/register/aarch64_sme_z_registers/za_dynamic_resize/TestZAThreadedDynamic.py +++ b/lldb/test/API/commands/register/register/aarch64_sme_z_registers/za_dynamic_resize/TestZAThreadedDynamic.py @@ -28,7 +28,7 @@ def get_supported_vg(self): ) current_vg = self.match("register read vg", ["(0x[0-9]+)"]) - self.assertTrue(current_vg is not None) + self.assertIsNotNone(current_vg) self.expect("register write vg {}".format(current_vg.group())) # Aka 128, 256 and 512 bit. diff --git a/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/TestSVEThreadedDynamic.py b/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/TestSVEThreadedDynamic.py index 5d5914bef3546..759dde96a9f15 100644 --- a/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/TestSVEThreadedDynamic.py +++ b/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/TestSVEThreadedDynamic.py @@ -40,7 +40,7 @@ def get_supported_vg(self): # Write back the current vg to confirm read/write works at all. 
current_vg = self.match("register read vg", ["(0x[0-9]+)"]) - self.assertTrue(current_vg is not None) + self.assertIsNotNone(current_vg) self.expect("register write vg {}".format(current_vg.group())) # Aka 128, 256 and 512 bit. diff --git a/lldb/test/API/commands/session/save/TestSessionSave.py b/lldb/test/API/commands/session/save/TestSessionSave.py index 6a40ec1eaf037..172a764523046 100644 --- a/lldb/test/API/commands/session/save/TestSessionSave.py +++ b/lldb/test/API/commands/session/save/TestSessionSave.py @@ -47,7 +47,7 @@ def test_session_save(self): raw += self.raw_transcript_builder(cmd, res) self.assertTrue(interpreter.HasCommands()) - self.assertTrue(len(raw) != 0) + self.assertNotEqual(len(raw), 0) # Check for error cmd = "session save /root/file" diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py index 0172ac536e979..6f083222227fb 100644 --- a/lldb/test/API/commands/statistics/basic/TestStats.py +++ b/lldb/test/API/commands/statistics/basic/TestStats.py @@ -600,7 +600,7 @@ def test_had_frame_variable_errors(self): # Get stats and verify we had errors. stats = self.get_stats() exe_stats = self.find_module_in_metrics(exe, stats) - self.assertTrue(exe_stats is not None) + self.assertIsNotNone(exe_stats) # Make sure we have "debugInfoHadVariableErrors" variable that is set to # false before failing to get local variables due to missing .o file. @@ -620,7 +620,7 @@ def test_had_frame_variable_errors(self): # Get stats and verify we had errors. stats = self.get_stats() exe_stats = self.find_module_in_metrics(exe, stats) - self.assertTrue(exe_stats is not None) + self.assertIsNotNone(exe_stats) # Make sure we have "hadFrameVariableErrors" variable that is set to # true after failing to get local variables due to missing .o file. 
diff --git a/lldb/test/API/commands/trace/TestTraceExport.py b/lldb/test/API/commands/trace/TestTraceExport.py index 7d237f7f1846c..a6ca736d3c5a4 100644 --- a/lldb/test/API/commands/trace/TestTraceExport.py +++ b/lldb/test/API/commands/trace/TestTraceExport.py @@ -229,7 +229,7 @@ def _testHtrBasicSuperBlockPassSequenceCheck(self): index_of_first_layer_1_block = None for i, event in enumerate(data): layer_id = event.get("pid") - self.assertTrue(layer_id is not None) + self.assertIsNotNone(layer_id) if layer_id == 1 and index_of_first_layer_1_block is None: index_of_first_layer_1_block = i num_units_by_layer[layer_id] += 1 diff --git a/lldb/test/API/commands/trace/TestTraceSave.py b/lldb/test/API/commands/trace/TestTraceSave.py index cc2d373220b82..ef1ab2f7aa41c 100644 --- a/lldb/test/API/commands/trace/TestTraceSave.py +++ b/lldb/test/API/commands/trace/TestTraceSave.py @@ -103,7 +103,7 @@ def checkSessionBundle(session_file_path): with open(session_file_path) as session_file: session = json.load(session_file) # We expect tsc conversion info - self.assertTrue("tscPerfZeroConversion" in session) + self.assertIn("tscPerfZeroConversion", session) # We expect at least one cpu self.assertGreater(len(session["cpus"]), 0) @@ -152,18 +152,18 @@ def checkSessionBundle(session_file_path): copied_process = find( lambda proc: proc["pid"] == process["pid"], copy["processes"] ) - self.assertTrue(copied_process is not None) + self.assertIsNotNone(copied_process) for thread in process["threads"]: copied_thread = find( lambda thr: thr["tid"] == thread["tid"], copied_process["threads"], ) - self.assertTrue(copied_thread is not None) + self.assertIsNotNone(copied_thread) for cpu in original["cpus"]: copied_cpu = find(lambda cor: cor["id"] == cpu["id"], copy["cpus"]) - self.assertTrue(copied_cpu is not None) + self.assertIsNotNone(copied_cpu) def testSaveTrace(self): self.expect( @@ -225,7 +225,7 @@ def testSaveKernelTrace(self): original_file = json.load(original_file) with 
open(copied_trace_file) as copy_file: copy_file = json.load(copy_file) - self.assertTrue("kernel" in copy_file) + self.assertIn("kernel", copy_file) self.assertEqual( os.path.basename(original_file["kernel"]["file"]), diff --git a/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py b/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py index 4bb58e7d027f7..c41e85fd670ba 100644 --- a/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py +++ b/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py @@ -236,7 +236,7 @@ def testStartPerCpuSession(self): ].strip() output = json.loads(response) - self.assertTrue(output is not None) + self.assertIsNotNone(output) self.assertIn("cpus", output) self.assertIn("tscPerfZeroConversion", output) found_non_empty_context_switch = False @@ -249,8 +249,8 @@ def testStartPerCpuSession(self): ipt_trace_size = binary_data["size"] elif binary_data["kind"] == "perfContextSwitchTrace": context_switch_size = binary_data["size"] - self.assertTrue(context_switch_size is not None) - self.assertTrue(ipt_trace_size is not None) + self.assertIsNotNone(context_switch_size) + self.assertIsNotNone(ipt_trace_size) if context_switch_size > 0: found_non_empty_context_switch = True diff --git a/lldb/test/API/functionalities/archives/TestBSDArchives.py b/lldb/test/API/functionalities/archives/TestBSDArchives.py index 570fd2ed8c0e6..1bef8e896e0be 100644 --- a/lldb/test/API/functionalities/archives/TestBSDArchives.py +++ b/lldb/test/API/functionalities/archives/TestBSDArchives.py @@ -85,13 +85,15 @@ def check_frame_variable_errors(self, thread, error_strings): api_error = var_list.GetError().GetCString() for s in error_strings: - self.assertTrue( - s in command_error, + self.assertIn( + s, + command_error, 'Make sure "%s" exists in the command error "%s"' % (s, command_error), ) for s in error_strings: - self.assertTrue( - s in api_error, + 
self.assertIn( + s, + api_error, 'Make sure "%s" exists in the API error "%s"' % (s, api_error), ) diff --git a/lldb/test/API/functionalities/asan/TestMemoryHistory.py b/lldb/test/API/functionalities/asan/TestMemoryHistory.py index b746651afe28d..00162ae8822c7 100644 --- a/lldb/test/API/functionalities/asan/TestMemoryHistory.py +++ b/lldb/test/API/functionalities/asan/TestMemoryHistory.py @@ -87,7 +87,7 @@ def asan_tests(self): self.assertEqual(threads.GetSize(), 2) history_thread = threads.GetThreadAtIndex(0) - self.assertTrue(history_thread.num_frames >= 2) + self.assertGreaterEqual(history_thread.num_frames, 2) self.assertEqual( history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), "main.c", @@ -97,7 +97,7 @@ def asan_tests(self): ) history_thread = threads.GetThreadAtIndex(1) - self.assertTrue(history_thread.num_frames >= 2) + self.assertGreaterEqual(history_thread.num_frames, 2) self.assertEqual( history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), "main.c", @@ -109,7 +109,7 @@ def asan_tests(self): # let's free the container (SBThreadCollection) and see if the # SBThreads still live threads = None - self.assertTrue(history_thread.num_frames >= 2) + self.assertGreaterEqual(history_thread.num_frames, 2) self.assertEqual( history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), "main.c", diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py index 923d8f8dc9ae5..620f648d51fd2 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py @@ -86,8 +86,9 @@ def test_breakpoints_with_relative_path_line_tables(self): ] for path in valid_paths: bkpt = target.BreakpointCreateByLocation(path, 2) - self.assertTrue( - bkpt.GetNumLocations() > 0, + self.assertGreater( + 
bkpt.GetNumLocations(), + 0, 'Couldn\'t resolve breakpoint using full path "%s" in executate "%s" with ' "debug info that has relative path with matching suffix" % (path, self.getBuildArtifact("a.out")), @@ -142,8 +143,9 @@ def test_breakpoints_with_bad_aranges(self): target = self.dbg.CreateTarget(obj_path) src_path = "/tmp/ab/main.cpp" bkpt = target.BreakpointCreateByLocation(src_path, 2) - self.assertTrue( - bkpt.GetNumLocations() > 0, + self.assertGreater( + bkpt.GetNumLocations(), + 0, 'Couldn\'t resolve breakpoint using "%s" in executate "%s" with ' "debug info that has a bad .debug_aranges section" % (src_path, self.getBuildArtifact("a.out")), @@ -613,8 +615,9 @@ def test_breakpoints_auto_source_map_relative(self): # is a suffix of request breakpoint file path path = "/x/y/a/b/c/main.cpp" bp = target.BreakpointCreateByLocation(path, 2) - self.assertTrue( - bp.GetNumLocations() > 0, + self.assertGreater( + bp.GetNumLocations(), + 0, 'Couldn\'t resolve breakpoint using full path "%s" in executate "%s" with ' "debug info that has relative path with matching suffix" % (path, self.getBuildArtifact("a.out")), @@ -632,8 +635,9 @@ def test_breakpoints_auto_source_map_relative(self): # equals the file path in debug info. 
path = "a/b/c/main.cpp" bp = target.BreakpointCreateByLocation(path, 2) - self.assertTrue( - bp.GetNumLocations() > 0, + self.assertGreater( + bp.GetNumLocations(), + 0, 'Couldn\'t resolve breakpoint using full path "%s" in executate "%s" with ' "debug info that has relative path with matching suffix" % (path, self.getBuildArtifact("a.out")), diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py index 36f37870629be..7889f08ba8dbb 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py @@ -76,7 +76,9 @@ def do_set_python_command_from_python(self): ) self.assertTrue(no_files_bkpt, VALID_BREAKPOINT) num_locations = no_files_bkpt.GetNumLocations() - self.assertTrue(num_locations >= 2, "Got at least two breakpoint locations") + self.assertGreaterEqual( + num_locations, 2, "Got at least two breakpoint locations" + ) got_one_in_A = False got_one_in_B = False for idx in range(0, num_locations): diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py b/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py index 4fcf19cf7df08..50ba0317fd094 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_conditions/TestBreakpointConditions.py @@ -143,8 +143,8 @@ def breakpoint_conditions_python(self): "The thread index should be invalid", ) # The thread name should be invalid, too. 
- self.assertTrue( - breakpoint.GetThreadName() is None, "The thread name should be invalid" + self.assertIsNone( + breakpoint.GetThreadName(), "The thread name should be invalid" ) # Let's set the thread index for this breakpoint and verify that it is, diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/TestRequireHWBreakpoints.py b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/TestRequireHWBreakpoints.py index 5325f0f00affb..b3568653002a2 100644 --- a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/TestRequireHWBreakpoints.py +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/TestRequireHWBreakpoints.py @@ -45,8 +45,8 @@ def test_step_range(self): error = lldb.SBError() thread.StepInto("", 4, error) self.assertTrue(error.Fail()) - self.assertTrue( - "Could not create hardware breakpoint for thread plan" in error.GetCString() + self.assertIn( + "Could not create hardware breakpoint for thread plan", error.GetCString() ) @skipTestIfFn(HardwareBreakpointTestBase.supports_hw_breakpoints) @@ -67,8 +67,8 @@ def test_step_out(self): error = lldb.SBError() thread.StepOut(error) self.assertTrue(error.Fail()) - self.assertTrue( - "Could not create hardware breakpoint for thread plan" in error.GetCString() + self.assertIn( + "Could not create hardware breakpoint for thread plan", error.GetCString() ) @skipTestIfFn(HardwareBreakpointTestBase.supports_hw_breakpoints) @@ -107,6 +107,6 @@ def test_step_until(self): # Ensure we fail when stepping through the API. 
error = thread.StepOverUntil(lldb.SBFrame(), lldb.SBFileSpec(), 5) self.assertTrue(error.Fail()) - self.assertTrue( - "Could not create hardware breakpoint for thread plan" in error.GetCString() + self.assertIn( + "Could not create hardware breakpoint for thread plan", error.GetCString() ) diff --git a/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py b/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py index b81e144e44af4..29cf31563a9a9 100644 --- a/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py +++ b/lldb/test/API/functionalities/breakpoint/objc/TestObjCBreakpoints.py @@ -42,8 +42,9 @@ def check_category_breakpoints(self): ) for bp_loc in selector_bp: function_name = bp_loc.GetAddress().GetSymbol().GetName() - self.assertTrue( - " myCategoryFunction]" in function_name, + self.assertIn( + " myCategoryFunction]", + function_name, 'Make sure all function names have " myCategoryFunction]" in their names', ) @@ -108,8 +109,9 @@ def check_objc_breakpoints(self, have_dsym): ) # There are 93 on the latest MacOSX for bp_loc in selector_bp: function_name = bp_loc.GetAddress().GetSymbol().GetName() - self.assertTrue( - " count]" in function_name, + self.assertIn( + " count]", + function_name, 'Make sure all function names have " count]" in their names', ) @@ -132,8 +134,9 @@ def check_objc_breakpoints(self, have_dsym): ) for bp_loc in selector_bp: function_name = bp_loc.GetAddress().GetSymbol().GetName() - self.assertTrue( - " isEqual:]" in function_name, + self.assertIn( + " isEqual:]", + function_name, 'Make sure all function names have " isEqual:]" in their names', ) diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py index 9f477f951cd86..0a1003a1b238c 100644 --- a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py +++ 
b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py @@ -123,8 +123,8 @@ def do_test(self): # Make sure these all got locations: for i in range(0, len(right)): - self.assertTrue( - right[i].GetNumLocations() >= 1, "Breakpoint %d has no locations." % (i) + self.assertGreaterEqual( + right[i].GetNumLocations(), 1, "Breakpoint %d has no locations." % (i) ) # Now some ones that won't take: @@ -229,7 +229,7 @@ def do_test_depths(self): bkpt = target.BreakpointCreateFromScript( "resolver.Resolver", extra_args, module_list, file_list ) - self.assertTrue(bkpt.GetNumLocations() > 0, "Resolver got no locations.") + self.assertGreater(bkpt.GetNumLocations(), 0, "Resolver got no locations.") self.expect( "script print(resolver.Resolver.got_files)", substrs=["2"], @@ -240,8 +240,8 @@ def do_test_depths(self): bkpt = target.BreakpointCreateFromScript( "resolver.ResolverModuleDepth", extra_args, module_list, file_list ) - self.assertTrue( - bkpt.GetNumLocations() > 0, "ResolverModuleDepth got no locations." + self.assertGreater( + bkpt.GetNumLocations(), 0, "ResolverModuleDepth got no locations." ) self.expect( "script print(resolver.Resolver.got_files)", @@ -253,7 +253,9 @@ def do_test_depths(self): bkpt = target.BreakpointCreateFromScript( "resolver.ResolverCUDepth", extra_args, module_list, file_list ) - self.assertTrue(bkpt.GetNumLocations() > 0, "ResolverCUDepth got no locations.") + self.assertGreater( + bkpt.GetNumLocations(), 0, "ResolverCUDepth got no locations." + ) self.expect( "script print(resolver.Resolver.got_files)", substrs=["1"], @@ -264,8 +266,8 @@ def do_test_depths(self): bkpt = target.BreakpointCreateFromScript( "resolver.ResolverBadDepth", extra_args, module_list, file_list ) - self.assertTrue( - bkpt.GetNumLocations() > 0, "ResolverBadDepth got no locations." + self.assertGreater( + bkpt.GetNumLocations(), 0, "ResolverBadDepth got no locations." 
) self.expect( "script print(resolver.Resolver.got_files)", @@ -277,8 +279,8 @@ def do_test_depths(self): bkpt = target.BreakpointCreateFromScript( "resolver.ResolverFuncDepth", extra_args, module_list, file_list ) - self.assertTrue( - bkpt.GetNumLocations() > 0, "ResolverFuncDepth got no locations." + self.assertGreater( + bkpt.GetNumLocations(), 0, "ResolverFuncDepth got no locations." ) self.expect( "script print(resolver.Resolver.got_files)", diff --git a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py index 985bafabdc5bc..411ce9c67da02 100644 --- a/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py +++ b/lldb/test/API/functionalities/breakpoint/serialize/TestBreakpointSerialization.py @@ -60,7 +60,7 @@ def test_resolver_serialization(self): exe_module.IsValid(), "Failed to find the executable module in target" ) sym_ctx_list = exe_module.FindFunctions("main") - self.assertTrue(sym_ctx_list.GetSize() == 1, "Unable to find function 'main'") + self.assertEqual(sym_ctx_list.GetSize(), 1, "Unable to find function 'main'") sym_ctx = sym_ctx_list.GetContextAtIndex(0) self.assertTrue( sym_ctx.IsValid(), "SBSymbolContext representing function 'main' is invalid" diff --git a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py index 15539b022bc85..3a7440a31677a 100644 --- a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py +++ b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py @@ -87,7 +87,7 @@ def test_step_instruction(self): self.thread.GetFrameAtIndex(0).GetLineEntry().GetLine(), self.line4 ) # breakpoint_2 and _3 should not affect step count - self.assertTrue(step_count >= steps_expected) + 
self.assertGreaterEqual(step_count, steps_expected) break # Run the process until termination diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py b/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py index cae1e3b498b06..512840b9f0655 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py @@ -315,11 +315,13 @@ def cleanup(): if self.TraceOn(): print(str_cast) - self.assertTrue(str_cast.find("A") != -1, "could not find A in output") - self.assertTrue(str_cast.find("B") != -1, "could not find B in output") - self.assertTrue(str_cast.find("C") != -1, "could not find C in output") - self.assertTrue(str_cast.find("D") != -1, "could not find D in output") - self.assertTrue(str_cast.find("4 = '\\0'") != -1, "could not find item 4 == 0") + self.assertNotEqual(str_cast.find("A"), -1, "could not find A in output") + self.assertNotEqual(str_cast.find("B"), -1, "could not find B in output") + self.assertNotEqual(str_cast.find("C"), -1, "could not find C in output") + self.assertNotEqual(str_cast.find("D"), -1, "could not find D in output") + self.assertNotEqual( + str_cast.find("4 = '\\0'"), -1, "could not find item 4 == 0" + ) self.dbg.GetSelectedTarget().GetProcess().GetSelectedThread().StepOver() @@ -331,8 +333,10 @@ def cleanup(): # we detect that all the values of the child objects have changed - but the counter-generated item # is still fixed at 0 because it is cached - this would fail if update(self): in ftsp returned False # or if synthetic children were not being preserved - self.assertTrue(str_cast.find("Q") != -1, "could not find Q in output") - self.assertTrue(str_cast.find("X") != -1, "could not find X in output") - self.assertTrue(str_cast.find("T") != -1, "could not find T 
in output") - self.assertTrue(str_cast.find("F") != -1, "could not find F in output") - self.assertTrue(str_cast.find("4 = '\\0'") != -1, "could not find item 4 == 0") + self.assertNotEqual(str_cast.find("Q"), -1, "could not find Q in output") + self.assertNotEqual(str_cast.find("X"), -1, "could not find X in output") + self.assertNotEqual(str_cast.find("T"), -1, "could not find T in output") + self.assertNotEqual(str_cast.find("F"), -1, "could not find F in output") + self.assertNotEqual( + str_cast.find("4 = '\\0'"), -1, "could not find item 4 == 0" + ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py index e2d2e67f1e885..98438742a11ca 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py @@ -83,17 +83,17 @@ def cleanup(): uncappedSummaryStream = lldb.SBStream() TheVeryLongOne.GetSummary(uncappedSummaryStream, summaryOptions) uncappedSummary = uncappedSummaryStream.GetData() - self.assertTrue( - uncappedSummary.find("someText") > 0, + self.assertGreater( + uncappedSummary.find("someText"), + 0, "uncappedSummary does not include the full string", ) summaryOptions.SetCapping(lldb.eTypeSummaryCapped) cappedSummaryStream = lldb.SBStream() TheVeryLongOne.GetSummary(cappedSummaryStream, summaryOptions) cappedSummary = cappedSummaryStream.GetData() - self.assertTrue( - cappedSummary.find("someText") <= 0, - "cappedSummary includes the full string", + self.assertLessEqual( + cappedSummary.find("someText"), 0, "cappedSummary includes the full string" ) self.expect_expr( diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py 
b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py index 660eb09db20da..eb7b394660b4b 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py @@ -105,17 +105,17 @@ def cleanup(): uncappedSummaryStream = lldb.SBStream() TheVeryLongOne.GetSummary(uncappedSummaryStream, summaryOptions) uncappedSummary = uncappedSummaryStream.GetData() - self.assertTrue( - uncappedSummary.find("someText") > 0, + self.assertGreater( + uncappedSummary.find("someText"), + 0, "uncappedSummary does not include the full string", ) summaryOptions.SetCapping(lldb.eTypeSummaryCapped) cappedSummaryStream = lldb.SBStream() TheVeryLongOne.GetSummary(cappedSummaryStream, summaryOptions) cappedSummary = cappedSummaryStream.GetData() - self.assertTrue( - cappedSummary.find("someText") <= 0, - "cappedSummary includes the full string", + self.assertLessEqual( + cappedSummary.find("someText"), 0, "cappedSummary includes the full string" ) self.expect_expr( diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestMSP430MSPDebug.py b/lldb/test/API/functionalities/gdb_remote_client/TestMSP430MSPDebug.py index 1359db491c24e..1dc6f89d1ea00 100644 --- a/lldb/test/API/functionalities/gdb_remote_client/TestMSP430MSPDebug.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestMSP430MSPDebug.py @@ -74,8 +74,8 @@ def test(self): # Test if the breakpoint address is resolved correctly self.assertEqual(bp.GetNumLocations(), 1, "Only one location") bp_loc = bp.GetLocationAtIndex(0) - self.assertTrue( - bp_loc.GetAddress().GetLoadAddress(target) == 0x510, "Address of main" + self.assertEqual( + bp_loc.GetAddress().GetLoadAddress(target), 0x510, "Address of main" ) # Test if the process stops at the breakpoint @@ -93,8 +93,9 @@ def 
test(self): # Test if thread can step a single instruction thread.StepInstruction(False) - self.assertTrue( - thread.GetFrameAtIndex(0).GetPCAddress().GetLoadAddress(target) == 0x516, + self.assertEqual( + thread.GetFrameAtIndex(0).GetPCAddress().GetLoadAddress(target), + 0x516, "Address of the next instruction", ) @@ -122,6 +123,6 @@ def test(self): self.assertEqual(reg.GetValueAsUnsigned(), reg_val_dict[reg.GetName()]) # Check if backtracing works: - self.assertTrue(len(thread.frames) >= 3) + self.assertGreaterEqual(len(thread.frames), 3) crt0_addr = thread.GetFrameAtIndex(2).GetPCAddress().GetLoadAddress(target) self.assertEqual(crt0_addr, 0x50A) diff --git a/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py b/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py index 4ed08833632db..98d7e8e9212fa 100644 --- a/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py +++ b/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py @@ -20,8 +20,9 @@ def test_multipledebuggers_commands(self): retobj = lldb.SBCommandReturnObject() interpreter_1.HandleCommand("apropos env", retobj) - self.assertTrue( - magic_text in str(retobj), + self.assertIn( + magic_text, + str(retobj), "[interpreter_1]: the output does not contain the correct words", ) @@ -37,8 +38,9 @@ def test_multipledebuggers_commands(self): retobj = lldb.SBCommandReturnObject() interpreter_2.HandleCommand("apropos env", retobj) - self.assertTrue( - magic_text in str(retobj), + self.assertIn( + magic_text, + str(retobj), "[interpreter_2]: the output does not contain the correct words", ) diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index a0e2afbf477d0..9fe5e89142987 100644 --- 
a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -22,7 +22,7 @@ def verify_core_file( self.assertTrue(process, PROCESS_IS_VALID) self.assertTrue(process.GetProcessInfo().IsValid()) self.assertEqual(process.GetProcessInfo().GetProcessID(), expected_pid) - self.assertTrue(target.GetTriple().find("linux") != -1) + self.assertNotEqual(target.GetTriple().find("linux"), -1) self.assertTrue(target.GetNumModules(), len(expected_modules)) self.assertEqual(process.GetNumThreads(), len(expected_threads)) @@ -40,7 +40,7 @@ def verify_core_file( thread = process.GetThreadAtIndex(thread_idx) self.assertTrue(thread.IsValid()) thread_id = thread.GetThreadID() - self.assertTrue(thread_id in expected_threads) + self.assertIn(thread_id, expected_threads) self.dbg.DeleteTarget(target) @skipUnlessArch("x86_64") diff --git a/lldb/test/API/functionalities/return-value/TestReturnValue.py b/lldb/test/API/functionalities/return-value/TestReturnValue.py index 3c212a3be660b..89e6e67583d7c 100644 --- a/lldb/test/API/functionalities/return-value/TestReturnValue.py +++ b/lldb/test/API/functionalities/return-value/TestReturnValue.py @@ -261,8 +261,9 @@ def return_and_test_struct_value(self, func_name): # Set the breakpoint, run to it, finish out. 
bkpt = self.target.BreakpointCreateByName(func_name) - self.assertTrue( - bkpt.GetNumResolvedLocations() > 0, + self.assertGreater( + bkpt.GetNumResolvedLocations(), + 0, "Got wrong number of locations for {0}".format(func_name), ) diff --git a/lldb/test/API/functionalities/scripted_process/TestScriptedProcess.py b/lldb/test/API/functionalities/scripted_process/TestScriptedProcess.py index 837ceea22ad16..5aaf68575623c 100644 --- a/lldb/test/API/functionalities/scripted_process/TestScriptedProcess.py +++ b/lldb/test/API/functionalities/scripted_process/TestScriptedProcess.py @@ -201,9 +201,7 @@ def cleanup(): py_impl = process_0.GetScriptedImplementation() self.assertTrue(py_impl) - self.assertTrue( - isinstance(py_impl, dummy_scripted_process.DummyScriptedProcess) - ) + self.assertIsInstance(py_impl, dummy_scripted_process.DummyScriptedProcess) self.assertFalse(hasattr(py_impl, "my_super_secret_member")) py_impl.my_super_secret_member = 42 self.assertTrue(hasattr(py_impl, "my_super_secret_member")) diff --git a/lldb/test/API/functionalities/signal/TestSendSignal.py b/lldb/test/API/functionalities/signal/TestSendSignal.py index 94d435a5fb17f..50435572c4d83 100644 --- a/lldb/test/API/functionalities/signal/TestSendSignal.py +++ b/lldb/test/API/functionalities/signal/TestSendSignal.py @@ -75,8 +75,8 @@ def test_with_run_command(self): self.assertEqual(len(threads), 1, "One thread stopped for a signal.") thread = threads[0] - self.assertTrue( - thread.GetStopReasonDataCount() >= 1, "There was data in the event." + self.assertGreaterEqual( + thread.GetStopReasonDataCount(), 1, "There was data in the event." 
) self.assertEqual( thread.GetStopReasonDataAtIndex(0), diff --git a/lldb/test/API/functionalities/signal/handle-abrt/TestHandleAbort.py b/lldb/test/API/functionalities/signal/handle-abrt/TestHandleAbort.py index a20a6a9040729..488ae9833c9ee 100644 --- a/lldb/test/API/functionalities/signal/handle-abrt/TestHandleAbort.py +++ b/lldb/test/API/functionalities/signal/handle-abrt/TestHandleAbort.py @@ -33,8 +33,8 @@ def test_inferior_handle_sigabrt(self): self.assertTrue( thread and thread.IsValid(), "Thread should be stopped due to a signal" ) - self.assertTrue( - thread.GetStopReasonDataCount() >= 1, "There should be data in the event." + self.assertGreaterEqual( + thread.GetStopReasonDataCount(), 1, "There should be data in the event." ) self.assertEqual( thread.GetStopReasonDataAtIndex(0), diff --git a/lldb/test/API/functionalities/signal/handle-segv/TestHandleSegv.py b/lldb/test/API/functionalities/signal/handle-segv/TestHandleSegv.py index 65718f8852048..de0d9e392d810 100644 --- a/lldb/test/API/functionalities/signal/handle-segv/TestHandleSegv.py +++ b/lldb/test/API/functionalities/signal/handle-segv/TestHandleSegv.py @@ -29,8 +29,8 @@ def test_inferior_handle_sigsegv(self): self.assertTrue( thread and thread.IsValid(), "Thread should be stopped due to a signal" ) - self.assertTrue( - thread.GetStopReasonDataCount() >= 1, "There was data in the event." + self.assertGreaterEqual( + thread.GetStopReasonDataCount(), 1, "There was data in the event." 
) self.assertEqual( thread.GetStopReasonDataAtIndex(0), signo, "The stop signal was SIGSEGV" diff --git a/lldb/test/API/functionalities/signal/raise/TestRaise.py b/lldb/test/API/functionalities/signal/raise/TestRaise.py index 874b06a2de3ed..9a6f2e6b70f39 100644 --- a/lldb/test/API/functionalities/signal/raise/TestRaise.py +++ b/lldb/test/API/functionalities/signal/raise/TestRaise.py @@ -88,8 +88,8 @@ def signal_test(self, signal, test_passing): self.assertState(process.GetState(), lldb.eStateStopped) thread = lldbutil.get_stopped_thread(process, lldb.eStopReasonSignal) self.assertTrue(thread.IsValid(), "Thread should be stopped due to a signal") - self.assertTrue( - thread.GetStopReasonDataCount() >= 1, "There was data in the event." + self.assertGreaterEqual( + thread.GetStopReasonDataCount(), 1, "There was data in the event." ) self.assertEqual( thread.GetStopReasonDataAtIndex(0), signo, "The stop signal was %s" % signal @@ -137,8 +137,8 @@ def signal_test(self, signal, test_passing): self.assertState(process.GetState(), lldb.eStateStopped) thread = lldbutil.get_stopped_thread(process, lldb.eStopReasonSignal) self.assertTrue(thread.IsValid(), "Thread should be stopped due to a signal") - self.assertTrue( - thread.GetStopReasonDataCount() >= 1, "There was data in the event." + self.assertGreaterEqual( + thread.GetStopReasonDataCount(), 1, "There was data in the event." 
) self.assertEqual( thread.GetStopReasonDataAtIndex(0), diff --git a/lldb/test/API/functionalities/step-avoids-no-debug/TestStepNoDebug.py b/lldb/test/API/functionalities/step-avoids-no-debug/TestStepNoDebug.py index 984a9c1b51c57..a5ec87274a5ba 100644 --- a/lldb/test/API/functionalities/step-avoids-no-debug/TestStepNoDebug.py +++ b/lldb/test/API/functionalities/step-avoids-no-debug/TestStepNoDebug.py @@ -83,8 +83,9 @@ def hit_correct_line(self, pattern): def hit_correct_function(self, pattern): name = self.thread.frames[0].GetFunctionName() - self.assertTrue( - pattern in name, + self.assertIn( + pattern, + name, "Got to '%s' not the expected function '%s'." % (name, pattern), ) diff --git a/lldb/test/API/functionalities/step-avoids-regexp/TestStepAvoidsRegexp.py b/lldb/test/API/functionalities/step-avoids-regexp/TestStepAvoidsRegexp.py index 26ab880c6ca67..ca56436eeae1e 100644 --- a/lldb/test/API/functionalities/step-avoids-regexp/TestStepAvoidsRegexp.py +++ b/lldb/test/API/functionalities/step-avoids-regexp/TestStepAvoidsRegexp.py @@ -11,8 +11,9 @@ class StepAvoidsRegexTestCase(TestBase): def hit_correct_function(self, pattern): name = self.thread.frames[0].GetFunctionName() - self.assertTrue( - pattern in name, + self.assertIn( + pattern, + name, "Got to '%s' not the expected function '%s'." % (name, pattern), ) diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py b/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py index c454c28b22501..abf761fb3420b 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py +++ b/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py @@ -85,8 +85,9 @@ def test_launch_notifications(self): # when reading dyld from the expanded shared cache. 
exe_basename = lldb.SBFileSpec(exe).basename if module.file.basename not in ["dyld", exe_basename]: - self.assertTrue( - module not in already_loaded_modules, + self.assertNotIn( + module, + already_loaded_modules, "{} is already loaded".format(module), ) already_loaded_modules.append(module) diff --git a/lldb/test/API/functionalities/thread/break_after_join/TestBreakAfterJoin.py b/lldb/test/API/functionalities/thread/break_after_join/TestBreakAfterJoin.py index 45533abc2b2b7..eb5692e782e08 100644 --- a/lldb/test/API/functionalities/thread/break_after_join/TestBreakAfterJoin.py +++ b/lldb/test/API/functionalities/thread/break_after_join/TestBreakAfterJoin.py @@ -73,8 +73,9 @@ def test(self): num_threads = process.GetNumThreads() # Make sure we see at least six threads - self.assertTrue( - num_threads >= 6, + self.assertGreaterEqual( + num_threads, + 6, "Number of expected threads and actual threads do not match.", ) diff --git a/lldb/test/API/functionalities/thread/create_during_step/TestCreateDuringStep.py b/lldb/test/API/functionalities/thread/create_during_step/TestCreateDuringStep.py index 597a83edad3dc..44851ed4f5482 100644 --- a/lldb/test/API/functionalities/thread/create_during_step/TestCreateDuringStep.py +++ b/lldb/test/API/functionalities/thread/create_during_step/TestCreateDuringStep.py @@ -137,12 +137,14 @@ def create_during_step_base(self, step_cmd, step_stop_reason): current_line = frame.GetLineEntry().GetLine() # Make sure we're still where we thought we were - self.assertTrue( - current_line >= self.breakpoint, + self.assertGreaterEqual( + current_line, + self.breakpoint, "Stepped to unexpected line, " + str(current_line), ) - self.assertTrue( - current_line <= self.continuepoint, + self.assertLessEqual( + current_line, + self.continuepoint, "Stepped to unexpected line, " + str(current_line), ) diff --git a/lldb/test/API/functionalities/thread/exit_during_break/TestExitDuringBreak.py 
b/lldb/test/API/functionalities/thread/exit_during_break/TestExitDuringBreak.py index f377716f3bf8f..41559b34b074e 100644 --- a/lldb/test/API/functionalities/thread/exit_during_break/TestExitDuringBreak.py +++ b/lldb/test/API/functionalities/thread/exit_during_break/TestExitDuringBreak.py @@ -49,8 +49,9 @@ def test(self): num_threads = process.GetNumThreads() # Make sure we see at least five threads - self.assertTrue( - num_threads >= 5, + self.assertGreaterEqual( + num_threads, + 5, "Number of expected threads and actual threads do not match.", ) diff --git a/lldb/test/API/functionalities/thread/multi_break/TestMultipleBreakpoints.py b/lldb/test/API/functionalities/thread/multi_break/TestMultipleBreakpoints.py index 8ae5d0eee75a5..bfd712037f0df 100644 --- a/lldb/test/API/functionalities/thread/multi_break/TestMultipleBreakpoints.py +++ b/lldb/test/API/functionalities/thread/multi_break/TestMultipleBreakpoints.py @@ -60,8 +60,9 @@ def test(self): num_threads = process.GetNumThreads() # Make sure we see all three threads - self.assertTrue( - num_threads >= 3, + self.assertGreaterEqual( + num_threads, + 3, "Number of expected threads and actual threads do not match.", ) diff --git a/lldb/test/API/functionalities/thread/num_threads/TestNumThreads.py b/lldb/test/API/functionalities/thread/num_threads/TestNumThreads.py index 7e4059cc0dfc7..ee9b14f15b6e9 100644 --- a/lldb/test/API/functionalities/thread/num_threads/TestNumThreads.py +++ b/lldb/test/API/functionalities/thread/num_threads/TestNumThreads.py @@ -63,8 +63,9 @@ def test_number_of_threads(self): # Using std::thread may involve extra threads, so we assert that there are # at least 4 rather than exactly 4. 
- self.assertTrue( - num_threads >= 13, + self.assertGreaterEqual( + num_threads, + 13, "Number of expected threads and actual threads do not match.", ) @@ -98,8 +99,9 @@ def test_unique_stacks(self): # Using std::thread may involve extra threads, so we assert that there are # at least 10 thread3's rather than exactly 10. - self.assertTrue( - num_threads >= 10, + self.assertGreaterEqual( + num_threads, + 10, "Number of expected threads and actual threads do not match.", ) diff --git a/lldb/test/API/functionalities/thread_plan/TestThreadPlanCommands.py b/lldb/test/API/functionalities/thread_plan/TestThreadPlanCommands.py index 7d03f4e884cbe..86ddabd30d240 100644 --- a/lldb/test/API/functionalities/thread_plan/TestThreadPlanCommands.py +++ b/lldb/test/API/functionalities/thread_plan/TestThreadPlanCommands.py @@ -121,8 +121,8 @@ def thread_plan_test(self): call_me_bkpt = target.BreakpointCreateBySourceRegex( "Set another here", self.main_source_file ) - self.assertTrue( - call_me_bkpt.GetNumLocations() > 0, "Set the breakpoint successfully" + self.assertGreater( + call_me_bkpt.GetNumLocations(), 0, "Set the breakpoint successfully" ) thread.StepUsingScriptedThreadPlan("wrap_step_over.WrapStepOver") threads = lldbutil.get_threads_stopped_at_breakpoint(process, call_me_bkpt) @@ -170,14 +170,14 @@ def thread_plan_test(self): second_step_bkpt = target.BreakpointCreateBySourceRegex( "Run here to step over again", self.main_source_file ) - self.assertTrue( - second_step_bkpt.GetNumLocations() > 0, "Set the breakpoint successfully" + self.assertGreater( + second_step_bkpt.GetNumLocations(), 0, "Set the breakpoint successfully" ) final_bkpt = target.BreakpointCreateBySourceRegex( "Make sure we get here on last continue", self.main_source_file ) - self.assertTrue( - final_bkpt.GetNumLocations() > 0, "Set the breakpoint successfully" + self.assertGreater( + final_bkpt.GetNumLocations(), 0, "Set the breakpoint successfully" ) threads = lldbutil.continue_to_breakpoint(process, 
second_step_bkpt) diff --git a/lldb/test/API/functionalities/tsan/basic/TestTsanBasic.py b/lldb/test/API/functionalities/tsan/basic/TestTsanBasic.py index de20d6ae8e2f3..ca8b74e35dff6 100644 --- a/lldb/test/API/functionalities/tsan/basic/TestTsanBasic.py +++ b/lldb/test/API/functionalities/tsan/basic/TestTsanBasic.py @@ -102,7 +102,7 @@ def tsan_tests(self): backtraces = thread.GetStopReasonExtendedBacktraces( lldb.eInstrumentationRuntimeTypeThreadSanitizer ) - self.assertTrue(backtraces.GetSize() >= 2) + self.assertGreaterEqual(backtraces.GetSize(), 2) # First backtrace is a memory operation thread = backtraces.GetThreadAtIndex(0) diff --git a/lldb/test/API/functionalities/tsan/multiple/TestTsanMultiple.py b/lldb/test/API/functionalities/tsan/multiple/TestTsanMultiple.py index 6d844fc9b0734..435e18084a79b 100644 --- a/lldb/test/API/functionalities/tsan/multiple/TestTsanMultiple.py +++ b/lldb/test/API/functionalities/tsan/multiple/TestTsanMultiple.py @@ -84,7 +84,7 @@ def tsan_tests(self): lldb.eInstrumentationRuntimeTypeThreadSanitizer ) ) - self.assertTrue(backtraces.GetSize() >= 1) + self.assertGreaterEqual(backtraces.GetSize(), 1) self.runCmd("continue") diff --git a/lldb/test/API/functionalities/unwind/aarch64_unwind_pac/TestAArch64UnwindPAC.py b/lldb/test/API/functionalities/unwind/aarch64_unwind_pac/TestAArch64UnwindPAC.py index e98df17ad43db..17e120d93f065 100644 --- a/lldb/test/API/functionalities/unwind/aarch64_unwind_pac/TestAArch64UnwindPAC.py +++ b/lldb/test/API/functionalities/unwind/aarch64_unwind_pac/TestAArch64UnwindPAC.py @@ -49,7 +49,9 @@ def test(self): "_start", ] - self.assertTrue(thread.GetNumFrames() >= (len(backtrace) + len(libc_backtrace))) + self.assertGreaterEqual( + thread.GetNumFrames(), len(backtrace) + len(libc_backtrace) + ) # Strictly check frames that are in the test program's source. 
for frame_idx, frame in enumerate(thread.frames[: len(backtrace)]): diff --git a/lldb/test/API/lang/c/register_variables/TestRegisterVariables.py b/lldb/test/API/lang/c/register_variables/TestRegisterVariables.py index 0f91aa5d044ee..0e51d4f7ff468 100644 --- a/lldb/test/API/lang/c/register_variables/TestRegisterVariables.py +++ b/lldb/test/API/lang/c/register_variables/TestRegisterVariables.py @@ -148,8 +148,9 @@ def test_and_run_command(self): ) # Validate that we verified at least one register variable - self.assertTrue( - register_variables_count > 0, + self.assertGreater( + register_variables_count, + 0, "expected to verify at least one variable in a register", ) self.trace( diff --git a/lldb/test/API/lang/c/stepping/TestStepAndBreakpoints.py b/lldb/test/API/lang/c/stepping/TestStepAndBreakpoints.py index 15b65ad35bcdb..8d0de40cdd7b6 100644 --- a/lldb/test/API/lang/c/stepping/TestStepAndBreakpoints.py +++ b/lldb/test/API/lang/c/stepping/TestStepAndBreakpoints.py @@ -82,7 +82,7 @@ def test_and_python_api(self): # Check that the stop ID increases: new_stop_id = process.GetStopID() - self.assertTrue(new_stop_id > old_stop_id, "Stop ID increases monotonically.") + self.assertGreater(new_stop_id, old_stop_id, "Stop ID increases monotonically.") thread = threads[0] @@ -141,8 +141,9 @@ def test_and_python_api(self): "Expression calling doesn't change stop ID", ) - self.assertTrue( - stop_id_after_including_expressions > stop_id_before_including_expressions, + self.assertGreater( + stop_id_after_including_expressions, + stop_id_before_including_expressions, "Stop ID including expressions increments over expression call.", ) diff --git a/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py b/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py index 44a584f7dd63e..60a2590e1559d 100644 --- a/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py +++ b/lldb/test/API/lang/cpp/dynamic-value/TestDynamicValue.py @@ -144,7 +144,7 @@ def test_get_dynamic_vals(self): 
anotherA_dynamic_typename = anotherA_dynamic.GetTypeName() self.assertNotEqual(anotherA_dynamic_typename.find("B"), -1) - self.assertTrue(anotherA_dynamic_addr < anotherA_static_addr) + self.assertLess(anotherA_dynamic_addr, anotherA_static_addr) anotherA_m_b_value_dynamic = anotherA_dynamic.GetChildMemberWithName( "m_b_value", True @@ -204,7 +204,7 @@ def examine_value_object_of_this_ptr( # And that the static address is greater than the dynamic one - self.assertTrue(this_static_loc > this_dynamic_loc) + self.assertGreater(this_static_loc, this_dynamic_loc) # Now read m_b_value which is only in the dynamic value: @@ -252,4 +252,4 @@ def examine_value_object_of_this_ptr( contained_b_addr = int(contained_b.GetValue(), 16) contained_b_static_addr = int(contained_b_static.GetValue(), 16) - self.assertTrue(contained_b_addr < contained_b_static_addr) + self.assertLess(contained_b_addr, contained_b_static_addr) diff --git a/lldb/test/API/lang/cpp/namespace/TestNamespace.py b/lldb/test/API/lang/cpp/namespace/TestNamespace.py index 3006699b6623a..d747e2be77c8e 100644 --- a/lldb/test/API/lang/cpp/namespace/TestNamespace.py +++ b/lldb/test/API/lang/cpp/namespace/TestNamespace.py @@ -32,8 +32,9 @@ def test_breakpoints_func_auto(self): ) for bp_loc in bp: name = bp_loc.GetAddress().GetFunction().GetName() - self.assertTrue( - name in names, + self.assertIn( + name, + names, "make sure breakpoint locations are correct for 'func' with eFunctionNameTypeAuto", ) @@ -61,8 +62,9 @@ def test_breakpoints_func_full(self): ) for bp_loc in bp: name = bp_loc.GetAddress().GetFunction().GetName() - self.assertTrue( - name in names, + self.assertIn( + name, + names, "make sure breakpoint locations are correct for 'func' with eFunctionNameTypeFull", ) @@ -88,8 +90,9 @@ def test_breakpoints_a_func_full(self): ) for bp_loc in bp: name = bp_loc.GetAddress().GetFunction().GetName() - self.assertTrue( - name in names, + self.assertIn( + name, + names, "make sure breakpoint locations are correct 
for 'A::func' with eFunctionNameTypeFull", ) diff --git a/lldb/test/API/lang/cpp/stl/TestSTL.py b/lldb/test/API/lang/cpp/stl/TestSTL.py index d7d75b25aa522..ee4f04661610f 100644 --- a/lldb/test/API/lang/cpp/stl/TestSTL.py +++ b/lldb/test/API/lang/cpp/stl/TestSTL.py @@ -50,7 +50,7 @@ def test_SBType_template_aspects(self): self.DebugSBType(map_type) self.assertTrue(map_type, VALID_TYPE) num_template_args = map_type.GetNumberOfTemplateArguments() - self.assertTrue(num_template_args > 0) + self.assertGreater(num_template_args, 0) # We expect the template arguments to contain at least 'string' and # 'int'. diff --git a/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py b/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py index 301f9cb90f5c9..245313d683774 100644 --- a/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py +++ b/lldb/test/API/lang/objc/foundation/TestFoundationDisassembly.py @@ -34,9 +34,7 @@ def test_foundation_disasm(self): foundation_framework = module.file.fullpath break - self.assertTrue( - foundation_framework is not None, "Foundation.framework path located" - ) + self.assertIsNotNone(foundation_framework, "Foundation.framework path located") self.runCmd("image dump symtab '%s'" % foundation_framework) raw_output = self.res.GetOutput() # Now, grab every 'Code' symbol and feed it into the command: diff --git a/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py b/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py index 29f5aeec1418d..473d6241485ec 100644 --- a/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py +++ b/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py @@ -35,7 +35,7 @@ def test_with_python_api(self): thread_list = lldbutil.get_threads_stopped_at_breakpoint(process, bpt) # Make sure we stopped at the first breakpoint. 
- self.assertTrue(len(thread_list) != 0, "No thread stopped at our breakpoint.") + self.assertNotEqual(len(thread_list), 0, "No thread stopped at our breakpoint.") self.assertEqual( len(thread_list), 1, "More than one thread stopped at our breakpoint." ) diff --git a/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py b/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py index 9ffcc715ad830..480d99523e8a2 100644 --- a/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py +++ b/lldb/test/API/lang/objc/objc-struct-argument/TestObjCStructArgument.py @@ -38,7 +38,7 @@ def test_with_python_api(self): thread_list = lldbutil.get_threads_stopped_at_breakpoint(process, bpt) # Make sure we stopped at the first breakpoint. - self.assertTrue(len(thread_list) != 0, "No thread stopped at our breakpoint.") + self.assertNotEqual(len(thread_list), 0, "No thread stopped at our breakpoint.") self.assertEqual( len(thread_list), 1, "More than one thread stopped at our breakpoint." ) diff --git a/lldb/test/API/lang/objc/objc-struct-return/TestObjCStructReturn.py b/lldb/test/API/lang/objc/objc-struct-return/TestObjCStructReturn.py index d704ed2114f45..520b89c7e2199 100644 --- a/lldb/test/API/lang/objc/objc-struct-return/TestObjCStructReturn.py +++ b/lldb/test/API/lang/objc/objc-struct-return/TestObjCStructReturn.py @@ -36,7 +36,7 @@ def test_with_python_api(self): thread_list = lldbutil.get_threads_stopped_at_breakpoint(process, bpt) # Make sure we stopped at the first breakpoint. - self.assertTrue(len(thread_list) != 0, "No thread stopped at our breakpoint.") + self.assertNotEqual(len(thread_list), 0, "No thread stopped at our breakpoint.") self.assertEqual( len(thread_list), 1, "More than one thread stopped at our breakpoint." 
) diff --git a/lldb/test/API/lang/objc/objc-super/TestObjCSuper.py b/lldb/test/API/lang/objc/objc-super/TestObjCSuper.py index 8e446224c0ade..eb640dc20af94 100644 --- a/lldb/test/API/lang/objc/objc-super/TestObjCSuper.py +++ b/lldb/test/API/lang/objc/objc-super/TestObjCSuper.py @@ -36,7 +36,7 @@ def test_with_python_api(self): thread_list = lldbutil.get_threads_stopped_at_breakpoint(process, bpt) # Make sure we stopped at the first breakpoint. - self.assertTrue(len(thread_list) != 0, "No thread stopped at our breakpoint.") + self.assertNotEqual(len(thread_list), 0, "No thread stopped at our breakpoint.") self.assertEqual( len(thread_list), 1, "More than one thread stopped at our breakpoint." ) diff --git a/lldb/test/API/lang/objc/rdar-12408181/TestRdar12408181.py b/lldb/test/API/lang/objc/rdar-12408181/TestRdar12408181.py index 50cb20a3de470..80942a8abd6ed 100644 --- a/lldb/test/API/lang/objc/rdar-12408181/TestRdar12408181.py +++ b/lldb/test/API/lang/objc/rdar-12408181/TestRdar12408181.py @@ -49,11 +49,12 @@ def test_nswindow_count(self): ): window = self.frame().FindVariable("window") window_dynamic = window.GetDynamicValue(lldb.eDynamicCanRunTarget) - self.assertTrue( - window.GetNumChildren() > 1, "NSWindow (static) only has 1 child!" + self.assertGreater( + window.GetNumChildren(), 1, "NSWindow (static) only has 1 child!" 
) - self.assertTrue( - window_dynamic.GetNumChildren() > 1, + self.assertGreater( + window_dynamic.GetNumChildren(), + 1, "NSWindow (dynamic) only has 1 child!", ) self.assertTrue( diff --git a/lldb/test/API/lang/objcxx/objc-builtin-types/TestObjCBuiltinTypes.py b/lldb/test/API/lang/objcxx/objc-builtin-types/TestObjCBuiltinTypes.py index 698b16df78360..1eb7205f1bb46 100644 --- a/lldb/test/API/lang/objcxx/objc-builtin-types/TestObjCBuiltinTypes.py +++ b/lldb/test/API/lang/objcxx/objc-builtin-types/TestObjCBuiltinTypes.py @@ -35,7 +35,7 @@ def test_with_python_api(self): thread_list = lldbutil.get_threads_stopped_at_breakpoint(process, bpt) # Make sure we stopped at the first breakpoint. - self.assertTrue(len(thread_list) != 0, "No thread stopped at our breakpoint.") + self.assertNotEqual(len(thread_list), 0, "No thread stopped at our breakpoint.") self.assertEqual( len(thread_list), 1, "More than one thread stopped at our breakpoint." ) diff --git a/lldb/test/API/linux/aarch64/tagged_memory_access/TestAArch64LinuxTaggedMemoryAccess.py b/lldb/test/API/linux/aarch64/tagged_memory_access/TestAArch64LinuxTaggedMemoryAccess.py index c023e77e57d5a..1b52cd4b9c488 100644 --- a/lldb/test/API/linux/aarch64/tagged_memory_access/TestAArch64LinuxTaggedMemoryAccess.py +++ b/lldb/test/API/linux/aarch64/tagged_memory_access/TestAArch64LinuxTaggedMemoryAccess.py @@ -84,6 +84,4 @@ def test_tagged_memory_find(self): # First check we actually got something. self.assertRegex(out, "data found at location: 0x[0-9A-Fa-f]+") # Then that the location found does not display the tag bits. 
- self.assertNotRegexpMatches( - out, "data found at location: 0x(34|56)[0-9A-Fa-f]+" - ) + self.assertNotRegex(out, "data found at location: 0x(34|56)[0-9A-Fa-f]+") diff --git a/lldb/test/API/linux/aarch64/unwind_signal/TestUnwindSignal.py b/lldb/test/API/linux/aarch64/unwind_signal/TestUnwindSignal.py index 0ac43f2b08801..46f05b8285a51 100644 --- a/lldb/test/API/linux/aarch64/unwind_signal/TestUnwindSignal.py +++ b/lldb/test/API/linux/aarch64/unwind_signal/TestUnwindSignal.py @@ -32,8 +32,8 @@ def test_unwind_signal(self): self.assertTrue( thread and thread.IsValid(), "Thread should be stopped due to a signal" ) - self.assertTrue( - thread.GetStopReasonDataCount() >= 1, "There should be data in the event." + self.assertGreaterEqual( + thread.GetStopReasonDataCount(), 1, "There should be data in the event." ) self.assertEqual( thread.GetStopReasonDataAtIndex(0), diff --git a/lldb/test/API/lua_api/TestLuaAPI.py b/lldb/test/API/lua_api/TestLuaAPI.py index 4063e80264a27..4c9a5d9672c4c 100644 --- a/lldb/test/API/lua_api/TestLuaAPI.py +++ b/lldb/test/API/lua_api/TestLuaAPI.py @@ -186,4 +186,4 @@ def test_lua_api(self): print(out) print(err, file=sys.stderr) - self.assertTrue(exitCode == 0, "Lua test '%s' failure." % lua_test) + self.assertEqual(exitCode, 0, "Lua test '%s' failure." % lua_test) diff --git a/lldb/test/API/macosx/function-starts/TestFunctionStarts.py b/lldb/test/API/macosx/function-starts/TestFunctionStarts.py index 7b08c9a56ba83..80b042827fb11 100644 --- a/lldb/test/API/macosx/function-starts/TestFunctionStarts.py +++ b/lldb/test/API/macosx/function-starts/TestFunctionStarts.py @@ -64,8 +64,8 @@ def do_function_starts(self, in_memory): self.assertSuccess(error, "Didn't attach successfully to %d" % (popen.pid)) bkpt = target.BreakpointCreateByName("dont_strip_me", exe) - self.assertTrue( - bkpt.GetNumLocations() > 0, "Didn't set the dont_strip_me bkpt." + self.assertGreater( + bkpt.GetNumLocations(), 0, "Didn't set the dont_strip_me bkpt." 
) threads = lldbutil.continue_to_breakpoint(process, bkpt) @@ -74,6 +74,6 @@ def do_function_starts(self, in_memory): # Our caller frame should have been stripped. Make sure we made a synthetic symbol # for it: thread = threads[0] - self.assertTrue(thread.num_frames > 1, "Couldn't backtrace.") + self.assertGreater(thread.num_frames, 1, "Couldn't backtrace.") name = thread.frame[1].GetFunctionName() self.assertTrue(name.startswith("___lldb_unnamed_symbol")) diff --git a/lldb/test/API/macosx/objc_exception_recognizer/TestObjCRecognizer.py b/lldb/test/API/macosx/objc_exception_recognizer/TestObjCRecognizer.py index c46c39346be4c..f49e7ca0837bb 100644 --- a/lldb/test/API/macosx/objc_exception_recognizer/TestObjCRecognizer.py +++ b/lldb/test/API/macosx/objc_exception_recognizer/TestObjCRecognizer.py @@ -41,8 +41,8 @@ def objc_recognizer_test(self, sub_class): exception_bkpt = target.BreakpointCreateForException( lldb.eLanguageTypeObjC, False, True ) - self.assertTrue( - exception_bkpt.GetNumLocations() > 0, "Got some exception locations" + self.assertGreater( + exception_bkpt.GetNumLocations(), 0, "Got some exception locations" ) threads = lldbutil.continue_to_breakpoint(process, exception_bkpt) diff --git a/lldb/test/API/macosx/profile_vrs_detach/TestDetachVrsProfile.py b/lldb/test/API/macosx/profile_vrs_detach/TestDetachVrsProfile.py index 0d8e23af98636..c2fa837ed01b8 100644 --- a/lldb/test/API/macosx/profile_vrs_detach/TestDetachVrsProfile.py +++ b/lldb/test/API/macosx/profile_vrs_detach/TestDetachVrsProfile.py @@ -49,7 +49,7 @@ def do_profile_and_detach(self): threads = lldbutil.continue_to_breakpoint(process, bkpt) self.assertEqual(len(threads), 1, "Hit our breakpoint again.") str = process.GetAsyncProfileData(1000) - self.assertTrue(len(str) > 0, "Got some profile data") + self.assertGreater(len(str), 0, "Got some profile data") # Now make the profiling interval very long and try to detach. 
interp.HandleCommand( diff --git a/lldb/test/API/macosx/queues/TestQueues.py b/lldb/test/API/macosx/queues/TestQueues.py index ff384fc48f25f..f2d15bb5ff15c 100644 --- a/lldb/test/API/macosx/queues/TestQueues.py +++ b/lldb/test/API/macosx/queues/TestQueues.py @@ -35,8 +35,9 @@ def setUp(self): self.main_source = "main.c" def check_queue_for_valid_queue_id(self, queue): - self.assertTrue( - queue.GetQueueID() != 0, + self.assertNotEqual( + queue.GetQueueID(), + 0, "Check queue %s for valid QueueID (got 0x%x)" % (queue.GetName(), queue.GetQueueID()), ) @@ -363,8 +364,8 @@ def queues_with_libBacktraceRecording(self): "Skipped because libBacktraceRecording.dylib was not loaded into the process." ) - self.assertTrue( - process.GetNumQueues() >= 4, "Found the correct number of queues." + self.assertGreaterEqual( + process.GetNumQueues(), 4, "Found the correct number of queues." ) queue_submittor_1 = lldb.SBQueue() @@ -456,8 +457,9 @@ def queues_with_libBacktraceRecording(self): "doing_the_work_2", "queue 2's pending item #0 should be doing_the_work_2", ) - self.assertTrue( - queue_performer_2.GetPendingItemAtIndex(9999).IsValid() == False, + self.assertEqual( + queue_performer_2.GetPendingItemAtIndex(9999).IsValid(), + False, "queue 2's pending item #9999 is invalid", ) diff --git a/lldb/test/API/macosx/universal/TestUniversal.py b/lldb/test/API/macosx/universal/TestUniversal.py index 8f5c4aa2a32c8..aecc8814b377e 100644 --- a/lldb/test/API/macosx/universal/TestUniversal.py +++ b/lldb/test/API/macosx/universal/TestUniversal.py @@ -144,8 +144,8 @@ def test_process_attach_with_wrong_arch(self): bkpt = target.BreakpointCreateBySourceRegex("sleep", lldb.SBFileSpec("main.c")) self.assertTrue(bkpt.IsValid(), "Valid breakpoint") - self.assertTrue( - bkpt.GetNumLocations() >= 1, "Our main breakpoint has locations." + self.assertGreaterEqual( + bkpt.GetNumLocations(), 1, "Our main breakpoint has locations." 
) popen = self.spawnSubprocess(exe, ["keep_waiting"]) @@ -167,4 +167,4 @@ def test_process_attach_with_wrong_arch(self): threads = lldbutil.continue_to_breakpoint(process, bkpt) self.assertEqual(len(threads), 1) thread = threads[0] - self.assertTrue(thread.GetNumFrames() > 1, "We were able to backtrace.") + self.assertGreater(thread.GetNumFrames(), 1, "We were able to backtrace.") diff --git a/lldb/test/API/python_api/event/TestEvents.py b/lldb/test/API/python_api/event/TestEvents.py index 3017c86a113d7..a15f4357f4f5f 100644 --- a/lldb/test/API/python_api/event/TestEvents.py +++ b/lldb/test/API/python_api/event/TestEvents.py @@ -369,7 +369,7 @@ def test_shadow_listener(self): # Now create a breakpoint on main.c by name 'c'. bkpt1 = target.BreakpointCreateByName("c", "a.out") self.trace("breakpoint:", bkpt1) - self.assertTrue(bkpt1.GetNumLocations() == 1, VALID_BREAKPOINT) + self.assertEqual(bkpt1.GetNumLocations(), 1, VALID_BREAKPOINT) self.primary_listener = lldb.SBListener("my listener") self.shadow_listener = lldb.SBListener("shadow listener") @@ -431,11 +431,11 @@ def test_shadow_listener(self): main_spec = lldb.SBFileSpec("main.c") bkpt2 = target.BreakpointCreateBySourceRegex("b.2. returns %d", main_spec) - self.assertTrue(bkpt2.GetNumLocations() > 0, "BP2 worked") + self.assertGreater(bkpt2.GetNumLocations(), 0, "BP2 worked") bkpt2.SetAutoContinue(True) bkpt3 = target.BreakpointCreateBySourceRegex("a.3. 
returns %d", main_spec) - self.assertTrue(bkpt3.GetNumLocations() > 0, "BP3 worked") + self.assertGreater(bkpt3.GetNumLocations(), 0, "BP3 worked") state = lldb.eStateStopped restarted = False diff --git a/lldb/test/API/python_api/file_handle/TestFileHandle.py b/lldb/test/API/python_api/file_handle/TestFileHandle.py index c235be4bdd5ee..b38585577f6f6 100644 --- a/lldb/test/API/python_api/file_handle/TestFileHandle.py +++ b/lldb/test/API/python_api/file_handle/TestFileHandle.py @@ -682,25 +682,25 @@ def test_stdout_file(self): def test_identity(self): f = io.StringIO() sbf = lldb.SBFile(f) - self.assertTrue(f is sbf.GetFile()) + self.assertIs(f, sbf.GetFile()) sbf.Close() self.assertTrue(f.closed) f = io.StringIO() sbf = lldb.SBFile.Create(f, borrow=True) - self.assertTrue(f is sbf.GetFile()) + self.assertIs(f, sbf.GetFile()) sbf.Close() self.assertFalse(f.closed) with open(self.out_filename, "w") as f: sbf = lldb.SBFile(f) - self.assertTrue(f is sbf.GetFile()) + self.assertIs(f, sbf.GetFile()) sbf.Close() self.assertTrue(f.closed) with open(self.out_filename, "w") as f: sbf = lldb.SBFile.Create(f, borrow=True) - self.assertFalse(f is sbf.GetFile()) + self.assertIsNot(f, sbf.GetFile()) sbf.Write(b"foobar\n") self.assertEqual(f.fileno(), sbf.GetFile().fileno()) sbf.Close() @@ -711,7 +711,7 @@ def test_identity(self): with open(self.out_filename, "wb") as f: sbf = lldb.SBFile.Create(f, borrow=True, force_io_methods=True) - self.assertTrue(f is sbf.GetFile()) + self.assertIs(f, sbf.GetFile()) sbf.Write(b"foobar\n") self.assertEqual(f.fileno(), sbf.GetFile().fileno()) sbf.Close() @@ -722,7 +722,7 @@ def test_identity(self): with open(self.out_filename, "wb") as f: sbf = lldb.SBFile.Create(f, force_io_methods=True) - self.assertTrue(f is sbf.GetFile()) + self.assertIs(f, sbf.GetFile()) sbf.Write(b"foobar\n") self.assertEqual(f.fileno(), sbf.GetFile().fileno()) sbf.Close() diff --git a/lldb/test/API/python_api/findvalue_duplist/TestSBFrameFindValue.py 
b/lldb/test/API/python_api/findvalue_duplist/TestSBFrameFindValue.py index f989eb5b574b3..f8ab3a74df378 100644 --- a/lldb/test/API/python_api/findvalue_duplist/TestSBFrameFindValue.py +++ b/lldb/test/API/python_api/findvalue_duplist/TestSBFrameFindValue.py @@ -25,7 +25,7 @@ def test_formatters_api(self): breakpoint = target.BreakpointCreateBySourceRegex( "Set breakpoint here", lldb.SBFileSpec("main.cpp") ) - self.assertTrue(breakpoint.GetNumLocations() > 0, VALID_BREAKPOINT) + self.assertGreater(breakpoint.GetNumLocations(), 0, VALID_BREAKPOINT) # Launch the process, and do not stop at the entry point. process = target.LaunchSimple(None, None, self.get_process_working_directory()) diff --git a/lldb/test/API/python_api/format/TestFormat.py b/lldb/test/API/python_api/format/TestFormat.py index db20f02ba1c1e..625eadd86ffe8 100644 --- a/lldb/test/API/python_api/format/TestFormat.py +++ b/lldb/test/API/python_api/format/TestFormat.py @@ -19,6 +19,6 @@ def test_format(self): self.assertTrue(error.Fail()) format = lldb.SBFormat("${frame.index}", error) - self.assertIs(error.GetCString(), None) + self.assertIsNone(error.GetCString()) self.assertTrue(format) self.assertTrue(error.Success()) diff --git a/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py b/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py index 8a811d25dac53..7e802f92da352 100644 --- a/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py +++ b/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py @@ -192,7 +192,7 @@ def cleanup(): ) self.assertTrue(foo_var.IsValid(), "could not find foo") - self.assertFalse(foo_var.GetNumChildren() == 2, "still seeing synthetic value") + self.assertNotEqual(foo_var.GetNumChildren(), 2, "still seeing synthetic value") filter = lldb.SBTypeFilter(0) filter.AppendExpressionPath("A") @@ -457,9 +457,8 @@ def cleanup(): "frame variable e2", substrs=["I am an empty Empty2 {}"], matching=False ) - self.assertTrue( - 
self.dbg.GetCategory(lldb.eLanguageTypeObjC) is not None, - "ObjC category is None", + self.assertIsNotNone( + self.dbg.GetCategory(lldb.eLanguageTypeObjC), "ObjC category is None" ) def test_force_synth_off(self): @@ -518,8 +517,8 @@ def cleanup(): int_vector = frame.FindVariable("int_vector") if self.TraceOn(): print(int_vector) - self.assertFalse( - int_vector.GetNumChildren() == 0, '"physical" vector is not empty' + self.assertNotEqual( + int_vector.GetNumChildren(), 0, '"physical" vector is not empty' ) self.runCmd("settings set target.enable-synthetic-value true") diff --git a/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py b/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py index 79714f6fd56c1..782edfa3c5e0d 100644 --- a/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py +++ b/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py @@ -56,9 +56,9 @@ def test(self): breakpoint2 = target.BreakpointCreateByLocation(self.source, line2) breakpoint3 = target.BreakpointCreateByLocation(self.source, line3) - self.assertTrue(breakpoint1.GetNumLocations() >= 1, PROCESS_IS_VALID) - self.assertTrue(breakpoint2.GetNumLocations() >= 1, PROCESS_IS_VALID) - self.assertTrue(breakpoint3.GetNumLocations() >= 1, PROCESS_IS_VALID) + self.assertGreaterEqual(breakpoint1.GetNumLocations(), 1, PROCESS_IS_VALID) + self.assertGreaterEqual(breakpoint2.GetNumLocations(), 1, PROCESS_IS_VALID) + self.assertGreaterEqual(breakpoint3.GetNumLocations(), 1, PROCESS_IS_VALID) # Register our shared libraries for remote targets so they get # automatically uploaded diff --git a/lldb/test/API/python_api/module_section/TestModuleAndSection.py b/lldb/test/API/python_api/module_section/TestModuleAndSection.py index 8a83740b06732..96b53563d5dfa 100644 --- a/lldb/test/API/python_api/module_section/TestModuleAndSection.py +++ b/lldb/test/API/python_api/module_section/TestModuleAndSection.py @@ -17,7 +17,7 @@ def test_module_and_section(self): target 
= self.dbg.CreateTarget(exe) self.assertTrue(target, VALID_TARGET) - self.assertTrue(target.GetNumModules() > 0) + self.assertGreater(target.GetNumModules(), 0) # Hide stdout if not running with '-t' option. if not self.TraceOn(): @@ -62,7 +62,7 @@ def test_module_and_section_boundary_condition(self): target = self.dbg.CreateTarget(exe) self.assertTrue(target, VALID_TARGET) - self.assertTrue(target.GetNumModules() > 0) + self.assertGreater(target.GetNumModules(), 0) # Hide stdout if not running with '-t' option. if not self.TraceOn(): @@ -102,7 +102,7 @@ def test_module_compile_unit_iter(self): target = self.dbg.CreateTarget(exe) self.assertTrue(target, VALID_TARGET) - self.assertTrue(target.GetNumModules() > 0) + self.assertGreater(target.GetNumModules(), 0) # Hide stdout if not running with '-t' option. if not self.TraceOn(): diff --git a/lldb/test/API/python_api/process/TestProcessAPI.py b/lldb/test/API/python_api/process/TestProcessAPI.py index 65330e5163f72..0b857fb033f1e 100644 --- a/lldb/test/API/python_api/process/TestProcessAPI.py +++ b/lldb/test/API/python_api/process/TestProcessAPI.py @@ -324,7 +324,7 @@ def test_remote_launch(self): if self.TraceOn(): print("process state:", state_type_to_str(process.GetState())) - self.assertTrue(process.GetState() != lldb.eStateConnected) + self.assertNotEqual(process.GetState(), lldb.eStateConnected) error = lldb.SBError() success = process.RemoteLaunch( diff --git a/lldb/test/API/python_api/process/io/TestProcessIO.py b/lldb/test/API/python_api/process/io/TestProcessIO.py index 381b06e422aff..5bb91d2758312 100644 --- a/lldb/test/API/python_api/process/io/TestProcessIO.py +++ b/lldb/test/API/python_api/process/io/TestProcessIO.py @@ -175,7 +175,7 @@ def run_process(self, put_stdin): self.breakpoint = self.target.BreakpointCreateBySourceRegex( "Set breakpoint here", lldb.SBFileSpec("main.c") ) - self.assertTrue(self.breakpoint.GetNumLocations() > 0, VALID_BREAKPOINT) + 
self.assertGreater(self.breakpoint.GetNumLocations(), 0, VALID_BREAKPOINT) # Launch the process, and do not stop at the entry point. error = lldb.SBError() @@ -223,11 +223,7 @@ def check_process_output(self, output, error): for line in self.lines: check_line = "input line to stdout: %s" % (line) - self.assertTrue( - check_line in output, "verify stdout line shows up in STDOUT" - ) + self.assertIn(check_line, output, "verify stdout line shows up in STDOUT") for line in self.lines: check_line = "input line to stderr: %s" % (line) - self.assertTrue( - check_line in error, "verify stderr line shows up in STDERR" - ) + self.assertIn(check_line, error, "verify stderr line shows up in STDERR") diff --git a/lldb/test/API/python_api/sbdata/TestSBData.py b/lldb/test/API/python_api/sbdata/TestSBData.py index c0db11d4e53ff..ef87d00f8240a 100644 --- a/lldb/test/API/python_api/sbdata/TestSBData.py +++ b/lldb/test/API/python_api/sbdata/TestSBData.py @@ -90,8 +90,8 @@ def test_with_run_command(self): self.assertTrue( (low == 9 and high == 0) or (low == 0 and high == 9), "foo[0].b == 9" ) - self.assertTrue( - fabs(data.GetFloat(error, offset) - 3.14) < 1, "foo[0].c == 3.14" + self.assertLess( + fabs(data.GetFloat(error, offset) - 3.14), 1, "foo[0].c == 3.14" ) self.assertSuccess(error) offset += 4 @@ -151,7 +151,7 @@ def test_with_run_command(self): self.assertEqual(data.uint32[0], 8, "then foo[1].a == 8") self.assertEqual(data.uint32[1], 7, "then foo[1].b == 7") # exploiting that sizeof(uint32) == sizeof(float) - self.assertTrue(fabs(data.float[2] - 3.14) < 1, "foo[1].c == 3.14") + self.assertLess(fabs(data.float[2] - 3.14), 1, "foo[1].c == 3.14") self.runCmd("n") @@ -160,8 +160,8 @@ def test_with_run_command(self): offset += 4 self.assert_data(data.GetUnsignedInt32, offset, 7) offset += 4 - self.assertTrue( - fabs(data.GetFloat(error, offset) - 3.14) < 1, "foo[1].c == 3.14" + self.assertLess( + fabs(data.GetFloat(error, offset) - 3.14), 1, "foo[1].c == 3.14" ) 
self.assertSuccess(error) @@ -172,8 +172,8 @@ def test_with_run_command(self): offset += 4 self.assert_data(data.GetUnsignedInt32, offset, 7) offset += 4 - self.assertTrue( - fabs(data.GetFloat(error, offset) - 6.28) < 1, "foo[1].c == 6.28" + self.assertLess( + fabs(data.GetFloat(error, offset) - 6.28), 1, "foo[1].c == 6.28" ) self.assertSuccess(error) @@ -187,14 +187,14 @@ def test_with_run_command(self): offset += 4 self.assert_data(data.GetUnsignedInt32, offset, 2) offset += 4 - self.assertTrue(fabs(data.GetFloat(error, offset) - 3) < 1, "barfoo[0].c == 3") + self.assertLess(fabs(data.GetFloat(error, offset) - 3), 1, "barfoo[0].c == 3") self.assertSuccess(error) offset += 4 self.assert_data(data.GetUnsignedInt32, offset, 4) offset += 4 self.assert_data(data.GetUnsignedInt32, offset, 5) offset += 4 - self.assertTrue(fabs(data.GetFloat(error, offset) - 6) < 1, "barfoo[1].c == 6") + self.assertLess(fabs(data.GetFloat(error, offset) - 6), 1, "barfoo[1].c == 6") self.assertSuccess(error) new_object = barfoo.CreateValueFromData( @@ -332,16 +332,16 @@ def test_with_run_command(self): data2 = lldb.SBData.CreateDataFromDoubleArray( process.GetByteOrder(), process.GetAddressByteSize(), [3.14, 6.28, 2.71] ) - self.assertTrue( - fabs(data2.GetDouble(error, 0) - 3.14) < 0.5, "double data2[0] = 3.14" + self.assertLess( + fabs(data2.GetDouble(error, 0) - 3.14), 0.5, "double data2[0] = 3.14" ) self.assertSuccess(error) - self.assertTrue( - fabs(data2.GetDouble(error, 8) - 6.28) < 0.5, "double data2[1] = 6.28" + self.assertLess( + fabs(data2.GetDouble(error, 8) - 6.28), 0.5, "double data2[1] = 6.28" ) self.assertSuccess(error) - self.assertTrue( - fabs(data2.GetDouble(error, 16) - 2.71) < 0.5, "double data2[2] = 2.71" + self.assertLess( + fabs(data2.GetDouble(error, 16) - 2.71), 0.5, "double data2[2] = 2.71" ) self.assertSuccess(error) @@ -380,8 +380,9 @@ def test_with_run_command(self): data2.uint64[4], 5, "read_data_helper failure: set data2[4] = 5" ) - self.assertTrue( - 
data2.uint64[0:2] == [1, 2], + self.assertEqual( + data2.uint64[0:2], + [1, 2], "read_data_helper failure: set data2[0:2] = [1,2]", ) @@ -417,26 +418,29 @@ def test_with_run_command(self): ) data2.SetDataFromDoubleArray([3.14, 6.28, 2.71]) - self.assertTrue( - fabs(data2.GetDouble(error, 0) - 3.14) < 0.5, "set double data2[0] = 3.14" + self.assertLess( + fabs(data2.GetDouble(error, 0) - 3.14), 0.5, "set double data2[0] = 3.14" ) - self.assertTrue( - fabs(data2.GetDouble(error, 8) - 6.28) < 0.5, "set double data2[1] = 6.28" + self.assertLess( + fabs(data2.GetDouble(error, 8) - 6.28), 0.5, "set double data2[1] = 6.28" ) - self.assertTrue( - fabs(data2.GetDouble(error, 16) - 2.71) < 0.5, "set double data2[2] = 2.71" + self.assertLess( + fabs(data2.GetDouble(error, 16) - 2.71), 0.5, "set double data2[2] = 2.71" ) - self.assertTrue( - fabs(data2.double[0] - 3.14) < 0.5, + self.assertLess( + fabs(data2.double[0] - 3.14), + 0.5, "read_data_helper failure: set double data2[0] = 3.14", ) - self.assertTrue( - fabs(data2.double[1] - 6.28) < 0.5, + self.assertLess( + fabs(data2.double[1] - 6.28), + 0.5, "read_data_helper failure: set double data2[1] = 6.28", ) - self.assertTrue( - fabs(data2.double[2] - 2.71) < 0.5, + self.assertLess( + fabs(data2.double[2] - 2.71), + 0.5, "read_data_helper failure: set double data2[2] = 2.71", ) @@ -452,7 +456,8 @@ def assert_data(self, func, arg, expected): "%s(error, %s) did not succeed: %s" % (func.__name__, arg, stream.GetData()), ) - self.assertTrue( - expected == result, + self.assertEqual( + expected, + result, "%s(error, %s) == %s != %s" % (func.__name__, arg, result, expected), ) diff --git a/lldb/test/API/python_api/sbmodule/TestSBModule.py b/lldb/test/API/python_api/sbmodule/TestSBModule.py index 69a20b32a058a..c04e2fa55e8cf 100644 --- a/lldb/test/API/python_api/sbmodule/TestSBModule.py +++ b/lldb/test/API/python_api/sbmodule/TestSBModule.py @@ -51,7 +51,7 @@ def test_module_is_file_backed(self): ) self.assertTrue(error.Success() 
and process, PROCESS_IS_VALID) main_module = target.FindModule(lldb.SBFileSpec("a.out")) - self.assertTrue(main_module is not None) + self.assertIsNotNone(main_module) self.assertFalse( main_module.IsFileBacked(), "The module should not be backed by a file on disk.", diff --git a/lldb/test/API/python_api/target/TestTargetAPI.py b/lldb/test/API/python_api/target/TestTargetAPI.py index 63d34340a8836..2e8d6a5b1e53f 100644 --- a/lldb/test/API/python_api/target/TestTargetAPI.py +++ b/lldb/test/API/python_api/target/TestTargetAPI.py @@ -112,7 +112,7 @@ def test_get_ABIName(self): target = self.create_simple_target("b.out") abi_pre_launch = target.GetABIName() - self.assertTrue(len(abi_pre_launch) != 0, "Got an ABI string") + self.assertNotEqual(len(abi_pre_launch), 0, "Got an ABI string") breakpoint = target.BreakpointCreateByLocation("main.c", self.line_main) self.assertTrue(breakpoint, VALID_BREAKPOINT) diff --git a/lldb/test/API/python_api/type/TestTypeList.py b/lldb/test/API/python_api/type/TestTypeList.py index 8d82f7b022874..e75affd652211 100644 --- a/lldb/test/API/python_api/type/TestTypeList.py +++ b/lldb/test/API/python_api/type/TestTypeList.py @@ -54,7 +54,7 @@ def test(self): % type_list.GetSize() ) # a second Task make be scared up by the Objective-C runtime - self.assertTrue(len(type_list) >= 1) + self.assertGreaterEqual(len(type_list), 1) for type in type_list: self.assertTrue(type) self.DebugSBType(type) @@ -133,7 +133,7 @@ def test(self): self.DebugSBType(union_type) # Check that we don't find indirectly nested types - self.assertTrue(enum_type.size == 1) + self.assertEqual(enum_type.size, 1) invalid_type = task_type.FindDirectNestedType("E2") self.assertFalse(invalid_type) diff --git a/lldb/test/API/python_api/value/change_values/TestChangeValueAPI.py b/lldb/test/API/python_api/value/change_values/TestChangeValueAPI.py index 07250eb6a4830..7d9aa76b9eef9 100644 --- a/lldb/test/API/python_api/value/change_values/TestChangeValueAPI.py +++ 
b/lldb/test/API/python_api/value/change_values/TestChangeValueAPI.py @@ -128,7 +128,7 @@ def test_change_value(self): "Val - 12345 Mine - 55, 98765, 55555555. Ptr - 66, 98765, 66666666" ) stdout = process.GetSTDOUT(1000) - self.assertTrue(expected_value in stdout, "STDOUT showed changed values.") + self.assertIn(expected_value, stdout, "STDOUT showed changed values.") # Finally, change the stack pointer to 0, and we should not make it to # our end breakpoint. @@ -150,8 +150,8 @@ def test_change_value(self): self.assertState(process.GetState(), lldb.eStateStopped) thread = lldbutil.get_stopped_thread(process, lldb.eStopReasonBreakpoint) - self.assertTrue( - thread is None, + self.assertIsNone( + thread, "We should not have managed to hit our second breakpoint with sp == 1", ) diff --git a/lldb/test/API/python_api/value/change_values/libcxx/atomic/TestChangeValue.py b/lldb/test/API/python_api/value/change_values/libcxx/atomic/TestChangeValue.py index 757965f680a67..c429eea501e2b 100644 --- a/lldb/test/API/python_api/value/change_values/libcxx/atomic/TestChangeValue.py +++ b/lldb/test/API/python_api/value/change_values/libcxx/atomic/TestChangeValue.py @@ -47,4 +47,4 @@ def test(self): result = inner_val.SetValueFromCString("42") self.assertTrue(result, "Setting val returned True.") result = inner_val.GetValueAsUnsigned() - self.assertTrue(result == 42, "Got correct value (42)") + self.assertEqual(result, 42, "Got correct value (42)") diff --git a/lldb/test/API/python_api/value/change_values/libcxx/map/TestChangeMapValue.py b/lldb/test/API/python_api/value/change_values/libcxx/map/TestChangeMapValue.py index 2cbfa04babb18..cd445e8c772de 100644 --- a/lldb/test/API/python_api/value/change_values/libcxx/map/TestChangeMapValue.py +++ b/lldb/test/API/python_api/value/change_values/libcxx/map/TestChangeMapValue.py @@ -44,10 +44,10 @@ def test(self): self.assertTrue(val_value.IsValid(), "Got the SBValue for val") pair0 = val_value.GetChildMemberWithName("[0]") 
self.assertTrue(pair0.IsValid(), "Got the SBValue for [0]") - self.assertTrue(pair0.GetNumChildren() == 2, "Got 2 children") + self.assertEqual(pair0.GetNumChildren(), 2, "Got 2 children") pair0_second = pair0.GetChildMemberWithName("second") self.assertTrue(pair0_second.IsValid(), "Got the SBValue for [0].second") result = pair0_second.SetValueFromCString("12345") self.assertTrue(result, "Setting val returned True.") result = pair0_second.GetValueAsUnsigned() - self.assertTrue(result == 12345, "Got correct value (12345)") + self.assertEqual(result, 12345, "Got correct value (12345)") diff --git a/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py b/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py index 5702b9934f329..f7c22fb9c2bd6 100644 --- a/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py +++ b/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py @@ -66,7 +66,7 @@ def test_watch_iter(self): self.assertEqual(target.GetNumWatchpoints(), 1) self.assertTrue(watchpoint.IsEnabled()) watch_id = watchpoint.GetID() - self.assertTrue(watch_id != 0) + self.assertNotEqual(watch_id, 0) # Continue. Expect the program to stop due to the variable being # written to. 
diff --git a/lldb/test/API/source-manager/TestSourceManager.py b/lldb/test/API/source-manager/TestSourceManager.py index 7569bb79257d1..eab8924d10814 100644 --- a/lldb/test/API/source-manager/TestSourceManager.py +++ b/lldb/test/API/source-manager/TestSourceManager.py @@ -259,7 +259,7 @@ def test_modify_source_file_while_debugging(self): m = re.search("^\[(\d+)\].*// Set break point at this line.", output) if not m: self.fail("Fail to display source level breakpoints") - self.assertTrue(int(m.group(1)) > 0) + self.assertGreater(int(m.group(1)), 0) # Modify content self.modify_content() diff --git a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py index 3781f651325d3..13190a50954ad 100644 --- a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py +++ b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py @@ -70,4 +70,4 @@ def test_core_file_source_mapping(self): source_map = [["/home/labath/test", current_dir]] self.attach(exe_file, coreFile=core_file, sourceMap=source_map) - self.assertTrue(current_dir in self.get_stackFrames()[0]["source"]["path"]) + self.assertIn(current_dir, self.get_stackFrames()[0]["source"]["path"]) diff --git a/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py b/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py index cb4e946c52112..1b96ea71659f9 100644 --- a/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py +++ b/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py @@ -31,11 +31,11 @@ def test_disassemble(self): self.continue_to_next_stop() pc_assembly = self.disassemble(frameIndex=0) - self.assertTrue("location" in pc_assembly, "Source location missing.") - self.assertTrue("instruction" in pc_assembly, "Assembly instruction missing.") + self.assertIn("location", pc_assembly, "Source location missing.") + self.assertIn("instruction", pc_assembly, "Assembly instruction missing.") # The calling frame (qsort) is 
coming from a system library, as a result # we should not have a source location. qsort_assembly = self.disassemble(frameIndex=1) - self.assertFalse("location" in qsort_assembly, "Source location not expected.") - self.assertTrue("instruction" in pc_assembly, "Assembly instruction missing.") + self.assertNotIn("location", qsort_assembly, "Source location not expected.") + self.assertIn("instruction", pc_assembly, "Assembly instruction missing.") diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py index a72571898ab50..f79a31988dc6c 100644 --- a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py +++ b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py @@ -78,7 +78,7 @@ def test_runInTerminal(self): # We verify we actually stopped inside the loop counter = int(self.dap_server.get_local_variable_value("counter")) - self.assertTrue(counter > 0) + self.assertGreater(counter, 0) # We verify we were able to set the launch arguments argc = int(self.dap_server.get_local_variable_value("argc")) @@ -122,7 +122,7 @@ def test_missingArgInRunInTerminalLauncher(self): capture_output=True, universal_newlines=True, ) - self.assertTrue(proc.returncode != 0) + self.assertNotEqual(proc.returncode, 0) self.assertIn( '"--launch-target" requires "--comm-file" to be specified', proc.stderr ) diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py b/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py index a51b2173f027d..70526cc715388 100644 --- a/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py +++ b/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py @@ -83,8 +83,8 @@ def test_stackTrace(self): # Verify we get all stack frames with no arguments (stackFrames, totalFrames) = self.get_stackFrames_and_totalFramesCount() frameCount = len(stackFrames) - self.assertTrue( - frameCount >= 20, "verify we get at least 
20 frames for all frames" + self.assertGreaterEqual( + frameCount, 20, "verify we get at least 20 frames for all frames" ) self.assertEqual( totalFrames, frameCount, "verify we get correct value for totalFrames count" diff --git a/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py b/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py index 80a15dd4c9ab7..ff5081a41424f 100644 --- a/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py +++ b/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py @@ -45,16 +45,16 @@ def test_terminated_event(self): self.continue_to_exit() statistics = self.dap_server.wait_for_terminated()["statistics"] - self.assertTrue(statistics["totalDebugInfoByteSize"] > 0) - self.assertTrue(statistics["totalDebugInfoEnabled"] > 0) - self.assertTrue(statistics["totalModuleCountHasDebugInfo"] > 0) + self.assertGreater(statistics["totalDebugInfoByteSize"], 0) + self.assertGreater(statistics["totalDebugInfoEnabled"], 0) + self.assertGreater(statistics["totalModuleCountHasDebugInfo"], 0) self.assertIsNotNone(statistics["memory"]) self.assertNotIn("modules", statistics.keys()) # lldb-dap debugs one target at a time target = json.loads(statistics["targets"])[0] - self.assertTrue(target["totalBreakpointResolveTime"] > 0) + self.assertGreater(target["totalBreakpointResolveTime"], 0) breakpoints = target["breakpoints"] self.assertIn( diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py index 1f7dd7b2c42a8..d886d0776ce58 100644 --- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py +++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py @@ -58,14 +58,14 @@ def verify_values(self, verify_dict, actual, varref_dict=None, expression=None): for key in verify: contains_array = verify[key] actual_value = actual[key] - self.assertTrue(isinstance(contains_array, list)) + 
self.assertIsInstance(contains_array, list) for verify_value in contains_array: self.assertIn(verify_value, actual_value) if "missing" in verify_dict: missing = verify_dict["missing"] for key in missing: - self.assertTrue( - key not in actual, 'key "%s" is not expected in %s' % (key, actual) + self.assertNotIn( + key, actual, 'key "%s" is not expected in %s' % (key, actual) ) hasVariablesReference = "variablesReference" in actual varRef = None @@ -727,8 +727,8 @@ def test_registers(self): if reg["name"] == pc_name: value = reg["value"] self.assertTrue(value.startswith("0x")) - self.assertTrue("a.out`main + " in value) - self.assertTrue("at main.cpp:" in value) + self.assertIn("a.out`main + ", value) + self.assertIn("at main.cpp:", value) @no_debug_info_test @skipUnlessDarwin diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteAttach.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteAttach.py index 38900353290bb..5aa790b1c97e5 100644 --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteAttach.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteAttach.py @@ -17,7 +17,7 @@ def test_attach_with_vAttach(self): # Make sure the target process has been launched. inferior = procs.get("inferior") self.assertIsNotNone(inferior) - self.assertTrue(inferior.pid > 0) + self.assertGreater(inferior.pid, 0) self.assertTrue(lldbgdbserverutils.process_is_running(inferior.pid, True)) # Add attach packets. diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteAuxvSupport.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteAuxvSupport.py index 999d8ce9d467c..25ad600c292d1 100644 --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteAuxvSupport.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteAuxvSupport.py @@ -117,8 +117,8 @@ def test_auxv_keys_look_valid(self): # small (usually smaller than 50), they can sometimes be larger. 
self.trace("auxv dict: {}".format(auxv_dict)) for auxv_key in auxv_dict: - self.assertTrue(auxv_key >= 1) - self.assertTrue(auxv_key <= 2500) + self.assertGreaterEqual(auxv_key, 1) + self.assertLessEqual(auxv_key, 2500) @skipIfWindows @expectedFailureNetBSD diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteExpeditedRegisters.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteExpeditedRegisters.py index 474fd7f2d9993..ddddc9ee72fcf 100644 --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteExpeditedRegisters.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteExpeditedRegisters.py @@ -44,7 +44,7 @@ def stop_notification_contains_generic_register(self, generic_register_name): # notification. expedited_registers = self.gather_expedited_registers() self.assertIsNotNone(expedited_registers) - self.assertTrue(len(expedited_registers) > 0) + self.assertGreater(len(expedited_registers), 0) # Gather target register infos. reg_infos = self.gather_register_infos() @@ -67,7 +67,7 @@ def test_stop_notification_contains_any_registers(self): # notification. expedited_registers = self.gather_expedited_registers() # Verify we have at least one expedited register. - self.assertTrue(len(expedited_registers) > 0) + self.assertGreater(len(expedited_registers), 0) def test_stop_notification_contains_no_duplicate_registers(self): self.build() @@ -113,7 +113,7 @@ def test_stop_notification_contains_vg_register(self): # notification. expedited_registers = self.gather_expedited_registers() self.assertIsNotNone(expedited_registers) - self.assertTrue(len(expedited_registers) > 0) + self.assertGreater(len(expedited_registers), 0) # Gather target register infos. 
reg_infos = self.gather_register_infos() diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py index 176dd18b18a22..39aa473322a9f 100644 --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py @@ -38,7 +38,7 @@ def grp_register_save_restore_works(self, with_suffix): for reg_info in reg_infos if self.is_bit_flippable_register(reg_info) ] - self.assertTrue(len(gpr_reg_infos) > 0) + self.assertGreater(len(gpr_reg_infos), 0) # Gather thread info. if with_suffix: @@ -77,7 +77,7 @@ def grp_register_save_restore_works(self, with_suffix): successful_writes, failed_writes ) ) - self.assertTrue(successful_writes > 0) + self.assertGreater(successful_writes, 0) flipped_reg_values = self.read_register_values( gpr_reg_infos, endian, thread_id=thread_id diff --git a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py index 46ed59808eb66..32b36bc04c1a3 100644 --- a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py +++ b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py @@ -214,7 +214,7 @@ def test_qRegisterInfo_contains_at_least_one_register_set(self): register_sets = { reg_info["set"]: 1 for reg_info in reg_infos if "set" in reg_info } - self.assertTrue(len(register_sets) >= 1) + self.assertGreaterEqual(len(register_sets), 1) def targetHasAVX(self): triple = self.dbg.GetSelectedPlatform().GetTriple() @@ -344,7 +344,7 @@ def test_p_returns_correct_data_size_for_each_qRegisterInfo_launch(self): # Gather register info entries. 
reg_infos = self.parse_register_info_packets(context) self.assertIsNotNone(reg_infos) - self.assertTrue(len(reg_infos) > 0) + self.assertGreater(len(reg_infos), 0) byte_order = self.get_target_byte_order() @@ -1072,7 +1072,7 @@ def get_qSupported_dict(self, features=[]): def test_qSupported_returns_known_stub_features(self): supported_dict = self.get_qSupported_dict() self.assertIsNotNone(supported_dict) - self.assertTrue(len(supported_dict) > 0) + self.assertGreater(len(supported_dict), 0) def test_qSupported_auvx(self): expected = ( @@ -1267,7 +1267,7 @@ def test_P_writes_all_gpr_registers(self): for reg_info in reg_infos if self.is_bit_flippable_register(reg_info) ] - self.assertTrue(len(gpr_reg_infos) > 0) + self.assertGreater(len(gpr_reg_infos), 0) # Write flipped bit pattern of existing value to each register. (successful_writes, failed_writes) = self.flip_all_bits_in_each_register_value( @@ -1278,7 +1278,7 @@ def test_P_writes_all_gpr_registers(self): successful_writes, failed_writes ) ) - self.assertTrue(successful_writes > 0) + self.assertGreater(successful_writes, 0) # Note: as of this moment, a hefty number of the GPR writes are failing # with E32 (everything except rax-rdx, rdi, rsi, rbp). 
@@ -1310,7 +1310,7 @@ def test_P_and_p_thread_suffix_work(self): reg_index = self.select_modifiable_register(reg_infos) self.assertIsNotNone(reg_index) reg_byte_size = int(reg_infos[reg_index]["bitsize"]) // 8 - self.assertTrue(reg_byte_size > 0) + self.assertGreater(reg_byte_size, 0) expected_reg_values = [] register_increment = 1 diff --git a/lldb/test/API/tools/lldb-server/attach-wait/TestGdbRemoteAttachWait.py b/lldb/test/API/tools/lldb-server/attach-wait/TestGdbRemoteAttachWait.py index 24548100de7d2..f4c31fe2f5c07 100644 --- a/lldb/test/API/tools/lldb-server/attach-wait/TestGdbRemoteAttachWait.py +++ b/lldb/test/API/tools/lldb-server/attach-wait/TestGdbRemoteAttachWait.py @@ -26,7 +26,7 @@ def _set_up_inferior(self): def _launch_inferior(self, args): inferior = self.spawnSubprocess(self.getBuildArtifact(self._exe_to_run), args) self.assertIsNotNone(inferior) - self.assertTrue(inferior.pid > 0) + self.assertGreater(inferior.pid, 0) self.assertTrue(lldbgdbserverutils.process_is_running(inferior.pid, True)) return inferior diff --git a/lldb/test/API/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py b/lldb/test/API/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py index 2a7f1f02155ab..bd78a83c65655 100644 --- a/lldb/test/API/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py +++ b/lldb/test/API/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py @@ -46,7 +46,7 @@ def test_g_target_xml_returns_correct_data(self): self.assertIsNotNone(feature) target_xml_registers = feature.findall("reg") - self.assertTrue(len(target_xml_registers) > 0) + self.assertGreater(len(target_xml_registers), 0) # registers info collected by qRegisterInfo self.add_register_info_collection_packets() From 1c81b4a8f237db286aae5b8893aea09676a41d68 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 21 Feb 2024 14:20:07 -0500 Subject: [PATCH 139/351] [gn] port 0a518db99e0c 
(clang-installapi) --- .../gn/secondary/clang/lib/Frontend/BUILD.gn | 3 --- .../gn/secondary/clang/lib/InstallAPI/BUILD.gn | 1 - llvm/utils/gn/secondary/clang/test/BUILD.gn | 1 + .../clang/tools/clang-installapi/BUILD.gn | 18 ++++++++++++++++++ 4 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn diff --git a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn index 5c4d7e1f4f559..948d1405676b7 100644 --- a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn @@ -8,7 +8,6 @@ static_library("Frontend") { "//clang/lib/Basic", "//clang/lib/Driver", "//clang/lib/Edit", - "//clang/lib/InstallAPI", "//clang/lib/Lex", "//clang/lib/Parse", "//clang/lib/Sema", @@ -19,7 +18,6 @@ static_library("Frontend") { "//llvm/lib/ProfileData", "//llvm/lib/Support", "//llvm/lib/TargetParser", - "//llvm/lib/TextAPI", ] sources = [ "ASTConsumers.cpp", @@ -40,7 +38,6 @@ static_library("Frontend") { "InitPreprocessor.cpp", "InterfaceStubFunctionsConsumer.cpp", "LayoutOverrideSource.cpp", - "InstallAPIConsumer.cpp", "LogDiagnosticPrinter.cpp", "ModuleDependencyCollector.cpp", "MultiplexConsumer.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn index 6eae7e293dce6..fbff113613d26 100644 --- a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn @@ -7,7 +7,6 @@ static_library("InstallAPI") { "//llvm/lib/TextAPI", ] sources = [ - "Context.cpp", "FileList.cpp", "HeaderFile.cpp", ] diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn index c7df8039723b1..bcf7c86ab9d08 100644 --- a/llvm/utils/gn/secondary/clang/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn @@ -157,6 +157,7 @@ group("test") { 
"//clang/tools/clang-format", "//clang/tools/clang-fuzzer/dictionary:clang-fuzzer-dictionary", "//clang/tools/clang-import-test", + "//clang/tools/clang-installapi", "//clang/tools/clang-offload-bundler", "//clang/tools/clang-refactor", "//clang/tools/clang-rename", diff --git a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn new file mode 100644 index 0000000000000..4f6895181f552 --- /dev/null +++ b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn @@ -0,0 +1,18 @@ +import("//llvm/utils/gn/build/driver_executable.gni") + +driver_executable("clang-installapi") { + configs += [ "//llvm/utils/gn/build:clang_code" ] + deps = [ + "//clang/lib/Driver", + "//clang/lib/Frontend", + "//clang/lib/InstallAPI", + "//clang/lib/Tooling", + "//llvm/lib/Support", + "//llvm/lib/TargetParser", + "//llvm/lib/TextAPI", + ] + sources = [ + "ClangInstallAPI.cpp", + "Options.cpp", + ] +} From ddc0f1d8fed4f1a1742598ffd7dc3195bb37a8f1 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Wed, 21 Feb 2024 19:22:19 +0000 Subject: [PATCH 140/351] [TargetLowering] Actually add the adjustment to the significand The logic was supposed to be choosing between {0, 1, -1} as an adjustment to the FP bit pattern. However, the adjustment itself was used as the bit pattern instead which result in garbage results. 
--- .../CodeGen/SelectionDAG/TargetLowering.cpp | 11 ++-- llvm/test/CodeGen/AMDGPU/bf16.ll | 60 ++++++++++--------- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index d059dc66d0588..bde1fff4e1ca7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10895,15 +10895,17 @@ SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op, EVT ResultIntVTCCVT = getSetCCResultType( DAG.getDataLayout(), *DAG.getContext(), And.getValueType()); SDValue Zero = DAG.getConstant(0, dl, ResultIntVT); + // The result is already odd so we don't need to do anything. SDValue AlreadyOdd = DAG.getSetCC(dl, ResultIntVTCCVT, And, Zero, ISD::SETNE); EVT WideSetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), AbsWide.getValueType()); + // We keep results which are exact, odd or NaN. SDValue KeepNarrow = DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETUEQ); KeepNarrow = DAG.getNode(ISD::OR, dl, WideSetCCVT, KeepNarrow, AlreadyOdd); - // We morally performed a round-down if `abs_narrow` is smaller than - // `abs_wide`. + // We morally performed a round-down if AbsNarrow is smaller than + // AbsWide. SDValue NarrowIsRd = DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETOGT); // If the narrow value is odd or exact, pick it. @@ -10911,12 +10913,13 @@ SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op, // or rounded-down value. If narrow is the rounded-down value, we want // the rounded-up value as it will be odd. 
SDValue Adjust = DAG.getSelect(dl, ResultIntVT, NarrowIsRd, One, NegativeOne); - Adjust = DAG.getSelect(dl, ResultIntVT, KeepNarrow, Zero, Adjust); + SDValue Adjusted = DAG.getNode(ISD::ADD, dl, ResultIntVT, NarrowBits, Adjust); + Op = DAG.getSelect(dl, ResultIntVT, KeepNarrow, NarrowBits, Adjusted); int ShiftAmount = BitSize - ResultVT.getScalarSizeInBits(); SDValue ShiftCnst = DAG.getShiftAmountConstant(ShiftAmount, WideIntVT, dl); SignBit = DAG.getNode(ISD::SRL, dl, WideIntVT, SignBit, ShiftCnst); SignBit = DAG.getNode(ISD::TRUNCATE, dl, ResultIntVT, SignBit); - Op = DAG.getNode(ISD::OR, dl, ResultIntVT, Adjust, SignBit); + Op = DAG.getNode(ISD::OR, dl, ResultIntVT, Op, SignBit); return DAG.getNode(ISD::BITCAST, dl, ResultVT, Op); } diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index e841a8867fc52..67538f26c550b 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2281,13 +2281,14 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| ; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5] -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v8, 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5] +; GFX8-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v6, v4 +; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-NEXT: v_or_b32_e32 v5, v4, v7 ; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 
v4, vcc, v4, v5 @@ -2310,14 +2311,15 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5] +; GFX9-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5] -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4 ; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9 @@ -2335,15 +2337,16 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5] +; GFX10-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5] -; GFX10-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] +; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4 +; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0x400000 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_u_f64_e32 
vcc_lo, v[0:1], v[0:1] ; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 ; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff @@ -2360,23 +2363,24 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5] ; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5] -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5] ; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0x400000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] ; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 ; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff ; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v5, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: 
global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] From 3d66d6932e26199f72766b6554d1c4878246ec6e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 21 Feb 2024 19:37:15 +0000 Subject: [PATCH 141/351] [VPlan] Support live-ins without underlying IR in type analysis. (#80723) A VPlan contains multiple live-ins without underlying IR, like VFxUF or VectorTripCount. Trying to infer the scalar type of those causes a crash at the moment. Update VPTypeAnalysis to take a VPlan in its constructor and assign types to those live-ins up front. All those live-ins share the type of the canonical IV. PR: https://github.com/llvm/llvm-project/pull/80723 --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 8 ++++++++ llvm/lib/Transforms/Vectorize/VPlan.h | 4 +--- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 16 ++++++++-------- llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 7 ++++++- .../lib/Transforms/Vectorize/VPlanTransforms.cpp | 11 +++++++---- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index e55db2df82b47..56310dc11786c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -212,6 +212,14 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { return It; } +VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, + DominatorTree *DT, IRBuilderBase &Builder, + InnerLoopVectorizer *ILV, VPlan *Plan, + LLVMContext &Ctx) + : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan), + LVer(nullptr), + TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {} + Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { if (Def->isLiveIn()) return Def->getLiveInIRValue(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a3ecdb99e9d9f..240d4bd628b05 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ 
b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -236,9 +236,7 @@ struct VPIteration { struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx) - : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan), - LVer(nullptr), TypeAnalysis(Ctx) {} + InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx); /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index b9ffe7e5b7af7..f55beac2047c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -35,12 +35,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { CachedTypes[OtherV] = ResTy; return ResTy; } - case Instruction::ICmp: { - // TODO: Check if types for both operands agree. This also requires - // type-inference for the vector-trip-count, which is missing at the moment. - Type *ResTy = inferScalarType(R->getOperand(0)); - return ResTy; - } + case Instruction::ICmp: case VPInstruction::FirstOrderRecurrenceSplice: { Type *ResTy = inferScalarType(R->getOperand(0)); VPValue *OtherV = R->getOperand(1); @@ -207,8 +202,13 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { if (Type *CachedTy = CachedTypes.lookup(V)) return CachedTy; - if (V->isLiveIn()) - return V->getLiveInIRValue()->getType(); + if (V->isLiveIn()) { + if (auto *IRValue = V->getLiveInIRValue()) + return IRValue->getType(); + // All VPValues without any underlying IR value (like the vector trip count + // or the backedge-taken count) have the same type as the canonical IV. 
+ return CanonicalIVTy; + } Type *ResultTy = TypeSwitch(V->getDefiningRecipe()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index 7276641551ae8..4e69de7fd6812 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -35,6 +35,10 @@ class Type; /// of the previously inferred types. class VPTypeAnalysis { DenseMap CachedTypes; + /// Type of the canonical induction variable. Used for all VPValues without + /// any underlying IR value (like the vector trip count or the backedge-taken + /// count). + Type *CanonicalIVTy; LLVMContext &Ctx; Type *inferScalarTypeForRecipe(const VPBlendRecipe *R); @@ -47,7 +51,8 @@ class VPTypeAnalysis { Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R); public: - VPTypeAnalysis(LLVMContext &Ctx) : Ctx(Ctx) {} + VPTypeAnalysis(Type *CanonicalIVTy, LLVMContext &Ctx) + : CanonicalIVTy(CanonicalIVTy), Ctx(Ctx) {} /// Infer the type of \p V. Returns the scalar type of \p V. Type *inferScalarType(const VPValue *V); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3d44342102420..9c3f35112b592 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -513,7 +513,8 @@ static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID, } // Truncate base induction if needed. - VPTypeAnalysis TypeInfo(SE.getContext()); + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), + SE.getContext()); Type *ResultTy = TypeInfo.inferScalarType(BaseIV); if (TruncI) { Type *TruncTy = TruncI->getType(); @@ -897,7 +898,9 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { #ifndef NDEBUG // Verify that the cached type info is for both A and its users is still // accurate by comparing it to freshly computed types. 
- VPTypeAnalysis TypeInfo2(TypeInfo.getContext()); + VPTypeAnalysis TypeInfo2( + R.getParent()->getPlan()->getCanonicalIV()->getScalarType(), + TypeInfo.getContext()); assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); for (VPUser *U : A->users()) { auto *R = dyn_cast(U); @@ -918,7 +921,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) { ReversePostOrderTraversal> RPOT( Plan.getEntry()); - VPTypeAnalysis TypeInfo(Ctx); + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { simplifyRecipe(R, TypeInfo); @@ -939,7 +942,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( // other uses have different types for their operands, making them invalidly // typed. DenseMap ProcessedTruncs; - VPTypeAnalysis TypeInfo(Ctx); + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx); VPBasicBlock *PH = Plan.getEntry(); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_deep(Plan.getVectorLoopRegion()))) { From 22cf983387e52e3df01504c69df8266e9d80d9da Mon Sep 17 00:00:00 2001 From: calebwat <107081575+calebwat@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:38:26 -0800 Subject: [PATCH 142/351] [VPlan] Use opaque pointers in VPlan unit test IR (#69947) Updates the unit tests for VPlan to use opaque pointers in strings containing LLVM IR. This is to match the similar adjustments being made for lit tests to use opaque pointers. 
--- .../Transforms/Vectorize/VPlanHCFGTest.cpp | 16 +- .../Transforms/Vectorize/VPlanSlpTest.cpp | 384 +++++++++--------- 2 files changed, 200 insertions(+), 200 deletions(-) diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 88ceb5952c6a5..be8be7acbe388 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -21,15 +21,15 @@ class VPlanHCFGTest : public VPlanTestBase {}; TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { const char *ModuleString = - "define void @f(i32* %A, i64 %N) {\n" + "define void @f(ptr %A, i64 %N) {\n" "entry:\n" " br label %for.body\n" "for.body:\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %arr.idx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv\n" - " %l1 = load i32, i32* %arr.idx, align 4\n" + " %arr.idx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv\n" + " %l1 = load i32, ptr %arr.idx, align 4\n" " %res = add i32 %l1, 10\n" - " store i32 %res, i32* %arr.idx, align 4\n" + " store i32 %res, ptr %arr.idx, align 4\n" " %indvars.iv.next = add i64 %indvars.iv, 1\n" " %exitcond = icmp ne i64 %indvars.iv.next, %N\n" " br i1 %exitcond, label %for.body, label %for.end\n" @@ -148,15 +148,15 @@ compound=true TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) { const char *ModuleString = - "define void @f(i32* %A, i64 %N) {\n" + "define void @f(ptr %A, i64 %N) {\n" "entry:\n" " br label %for.body\n" "for.body:\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %arr.idx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv\n" - " %l1 = load i32, i32* %arr.idx, align 4\n" + " %arr.idx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv\n" + " %l1 = load i32, ptr %arr.idx, align 4\n" " %res = add i32 %l1, 10\n" - " store i32 %res, i32* %arr.idx, align 4\n" + " store i32 %res, ptr %arr.idx, align 4\n" " %indvars.iv.next 
= add i64 %indvars.iv, 1\n" " %exitcond = icmp ne i64 %indvars.iv.next, %N\n" " br i1 %exitcond, label %for.body, label %for.end\n" diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp index 70951f3a656a0..396919763c933 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -56,33 +56,33 @@ TEST_F(VPlanSlpTest, testSlpSimple_2) { "%struct.Test = type { i32, i32 }\n" "%struct.Test3 = type { i32, i32, i32 }\n" "%struct.Test4xi8 = type { i8, i8, i8 }\n" - "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " - "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "define void @add_x2(ptr nocapture readonly %A, ptr " + "nocapture readonly %B, ptr nocapture %C) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %add0 = add nsw i32 %vA0, %vB0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %add1 = add nsw i32 
%vA1, %vB1\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " store i32 %add0, i32* %C0, align 4\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " store i32 %add0, ptr %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " store i32 %add1, i32* %C1, align 4\n" + " store i32 %add1, ptr %C1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -129,33 +129,33 @@ TEST_F(VPlanSlpTest, testSlpSimple_3) { "%struct.Test = type { i32, i32 }\n" "%struct.Test3 = type { i32, i32, i32 }\n" "%struct.Test4xi8 = type { i8, i8, i8 }\n" - "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " - "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "define void @add_x2(ptr nocapture readonly %A, ptr " + "nocapture readonly %B, ptr nocapture %C) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr %struct.Test, ptr %A, i64 " " %indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " " %indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %add0 = add nsw i32 %vA0, %vB0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " " %indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, 
i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " " %indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %add1 = add nsw i32 %vA1, %vB1\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " " %indvars.iv, i32 0\n" - " store i32 %add0, i32* %C0, align 4\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " store i32 %add0, ptr %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " " %indvars.iv, i32 1\n" - " store i32 %add1, i32* %C1, align 4\n" + " store i32 %add1, ptr %C1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -206,27 +206,27 @@ TEST_F(VPlanSlpTest, testSlpSimple_3) { TEST_F(VPlanSlpTest, testSlpReuse_1) { const char *ModuleString = "%struct.Test = type { i32, i32 }\n" - "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " - "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "define void @add_x2(ptr nocapture readonly %A, ptr " + "nocapture readonly %B, ptr nocapture %C) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" + " %vA0 = load i32, ptr %A0, align 4\n" " %add0 = add nsw i32 %vA0, %vA0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" + " %vA1 = load i32, ptr %A1, align 4\n" " %add1 = add nsw i32 %vA1, 
%vA1\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " store i32 %add0, i32* %C0, align 4\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " store i32 %add0, ptr %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " store i32 %add1, i32* %C1, align 4\n" + " store i32 %add1, ptr %C1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -269,27 +269,27 @@ TEST_F(VPlanSlpTest, testSlpReuse_1) { TEST_F(VPlanSlpTest, testSlpReuse_2) { const char *ModuleString = "%struct.Test = type { i32, i32 }\n" - "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " - "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "define void @add_x2(ptr nocapture readonly %A, ptr " + "nocapture readonly %B, ptr nocapture %C) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" + " %vA0 = load i32, ptr %A0, align 4\n" " %add0 = add nsw i32 %vA0, %vA0\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " store i32 %add0, i32* %C0, align 4\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " store i32 %add0, ptr %C0, align 4\n" + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" + " %vA1 = load i32, ptr %A1, align 4\n" " %add1 = add nsw i32 %vA1, %vA1\n" - 
" %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " store i32 %add1, i32* %C1, align 4\n" + " store i32 %add1, ptr %C1, align 4\n" " %use = add i32 %vA1, 1\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" @@ -386,49 +386,49 @@ TEST_F(VPlanSlpTest, testSlpReorder_1) { LLVMContext Ctx; const char *ModuleString = "%struct.Test = type { i32, i32 }\n" - "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* " - "%C, %struct.Test* %D, %struct.Test* %E) {\n" + "define void @add_x3(ptr %A, ptr %B, ptr " + "%C, ptr %D, ptr %E) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %mul11 = mul nsw i32 %vA0, %vB0\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " %vC0 = load i32, i32* %C0, align 4\n" - " %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + " %vC0 = load i32, ptr %C0, align 4\n" + " %D0 = getelementptr inbounds %struct.Test, ptr %D, i64 " "%indvars.iv, i32 0\n" - " %vD0 = load i32, i32* %D0, align 4\n" + " %vD0 = load i32, ptr %D0, align 4\n" " %mul12 = mul nsw i32 %vC0, %vD0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds 
%struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %mul21 = mul nsw i32 %vA1, %vB1\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " %vC1 = load i32, i32* %C1, align 4\n" - " %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + " %vC1 = load i32, ptr %C1, align 4\n" + " %D1 = getelementptr inbounds %struct.Test, ptr %D, i64 " "%indvars.iv, i32 1\n" - " %vD1 = load i32, i32* %D1, align 4\n" + " %vD1 = load i32, ptr %D1, align 4\n" " %mul22 = mul nsw i32 %vC1, %vD1\n" " %add1 = add nsw i32 %mul11, %mul12\n" " %add2 = add nsw i32 %mul22, %mul21\n" - " %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + " %E0 = getelementptr inbounds %struct.Test, ptr %E, i64 " "%indvars.iv, i32 0\n" - " store i32 %add1, i32* %E0, align 4\n" - " %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + " store i32 %add1, ptr %E0, align 4\n" + " %E1 = getelementptr inbounds %struct.Test, ptr %E, i64 " "%indvars.iv, i32 1\n" - " store i32 %add2, i32* %E1, align 4\n" + " store i32 %add2, ptr %E1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -458,49 +458,49 @@ TEST_F(VPlanSlpTest, testSlpReorder_2) { LLVMContext Ctx; const char *ModuleString = "%struct.Test = type { i32, i32 }\n" - "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* " - "%C, %struct.Test* %D, %struct.Test* %E) {\n" + "define void @add_x3(ptr %A, ptr %B, ptr " + "%C, ptr %D, ptr %E) {\n" "entry:\n" 
" br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %mul11 = mul nsw i32 %vA0, %vB0\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " %vC0 = load i32, i32* %C0, align 4\n" - " %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + " %vC0 = load i32, ptr %C0, align 4\n" + " %D0 = getelementptr inbounds %struct.Test, ptr %D, i64 " "%indvars.iv, i32 0\n" - " %vD0 = load i32, i32* %D0, align 4\n" + " %vD0 = load i32, ptr %D0, align 4\n" " %mul12 = mul nsw i32 %vC0, %vD0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %mul21 = mul nsw i32 %vB1, %vA1\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " %vC1 = load i32, i32* %C1, align 4\n" - " %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + " %vC1 = load i32, ptr %C1, align 4\n" + " %D1 
= getelementptr inbounds %struct.Test, ptr %D, i64 " "%indvars.iv, i32 1\n" - " %vD1 = load i32, i32* %D1, align 4\n" + " %vD1 = load i32, ptr %D1, align 4\n" " %mul22 = mul nsw i32 %vD1, %vC1\n" " %add1 = add nsw i32 %mul11, %mul12\n" " %add2 = add nsw i32 %mul22, %mul21\n" - " %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + " %E0 = getelementptr inbounds %struct.Test, ptr %E, i64 " "%indvars.iv, i32 0\n" - " store i32 %add1, i32* %E0, align 4\n" - " %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + " store i32 %add1, ptr %E0, align 4\n" + " %E1 = getelementptr inbounds %struct.Test, ptr %E, i64 " "%indvars.iv, i32 1\n" - " store i32 %add2, i32* %E1, align 4\n" + " store i32 %add2, ptr %E1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -530,49 +530,49 @@ TEST_F(VPlanSlpTest, testSlpReorder_3) { LLVMContext Ctx; const char *ModuleString = "%struct.Test = type { i32, i32 }\n" - "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* " - "%C, %struct.Test* %D, %struct.Test* %E) {\n" + "define void @add_x3(ptr %A, ptr %B, ptr " + "%C, ptr %D, ptr %E) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %mul11 = mul nsw i32 %vA1, %vB0\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " 
+ " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " %vC0 = load i32, i32* %C0, align 4\n" - " %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + " %vC0 = load i32, ptr %C0, align 4\n" + " %D0 = getelementptr inbounds %struct.Test, ptr %D, i64 " "%indvars.iv, i32 0\n" - " %vD0 = load i32, i32* %D0, align 4\n" + " %vD0 = load i32, ptr %D0, align 4\n" " %mul12 = mul nsw i32 %vC0, %vD0\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %mul21 = mul nsw i32 %vB1, %vA0\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " %vC1 = load i32, i32* %C1, align 4\n" - " %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + " %vC1 = load i32, ptr %C1, align 4\n" + " %D1 = getelementptr inbounds %struct.Test, ptr %D, i64 " "%indvars.iv, i32 1\n" - " %vD1 = load i32, i32* %D1, align 4\n" + " %vD1 = load i32, ptr %D1, align 4\n" " %mul22 = mul nsw i32 %vD1, %vC1\n" " %add1 = add nsw i32 %mul11, %mul12\n" " %add2 = add nsw i32 %mul22, %mul21\n" - " %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + " %E0 = getelementptr inbounds %struct.Test, ptr %E, i64 " "%indvars.iv, i32 0\n" - " store i32 %add1, i32* %E0, align 4\n" - " %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + " store i32 %add1, ptr %E0, align 4\n" + " %E1 = getelementptr inbounds %struct.Test, ptr %E, i64 " "%indvars.iv, i32 1\n" - " store i32 %add2, i32* %E1, align 4\n" + " store i32 
%add2, ptr %E1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -606,49 +606,49 @@ TEST_F(VPlanSlpTest, testSlpReorder_4) { LLVMContext Ctx; const char *ModuleString = "%struct.Test = type { i32, i32 }\n" - "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* " - "%C, %struct.Test* %D, %struct.Test* %E) {\n" + "define void @add_x3(ptr %A, ptr %B, ptr " + "%C, ptr %D, ptr %E) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %mul11 = mul nsw i32 %vA0, %vB0\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " %vC0 = load i32, i32* %C0, align 4\n" - " %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + " %vC0 = load i32, ptr %C0, align 4\n" + " %D0 = getelementptr inbounds %struct.Test, ptr %D, i64 " "%indvars.iv, i32 0\n" - " %vD0 = load i32, i32* %D0, align 4\n" + " %vD0 = load i32, ptr %D0, align 4\n" " %mul12 = mul nsw i32 %vC0, %vD0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = 
load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %mul21 = mul nsw i32 %vA1, %vB1\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " %vC1 = load i32, i32* %C1, align 4\n" - " %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + " %vC1 = load i32, ptr %C1, align 4\n" + " %D1 = getelementptr inbounds %struct.Test, ptr %D, i64 " "%indvars.iv, i32 1\n" - " %vD1 = load i32, i32* %D1, align 4\n" + " %vD1 = load i32, ptr %D1, align 4\n" " %mul22 = mul nsw i32 %vC1, %vD1\n" " %add1 = add nsw i32 %mul11, %mul12\n" " %add2 = add nsw i32 %mul22, %mul21\n" - " %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + " %E0 = getelementptr inbounds %struct.Test, ptr %E, i64 " "%indvars.iv, i32 0\n" - " store i32 %add1, i32* %E0, align 4\n" - " %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + " store i32 %add1, ptr %E0, align 4\n" + " %E1 = getelementptr inbounds %struct.Test, ptr %E, i64 " "%indvars.iv, i32 1\n" - " store i32 %add2, i32* %E1, align 4\n" + " store i32 %add2, ptr %E1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -680,35 +680,35 @@ TEST_F(VPlanSlpTest, testInstrsInDifferentBBs) { "%struct.Test = type { i32, i32 }\n" "%struct.Test3 = type { i32, i32, i32 }\n" "%struct.Test4xi8 = type { i8, i8, i8 }\n" - "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " - "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "define void @add_x2(ptr nocapture readonly %A, ptr " + "nocapture readonly %B, ptr nocapture %C) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " 
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %bb2 ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %add0 = add nsw i32 %vA0, %vB0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" " br label %bb2\n" "bb2:\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %add1 = add nsw i32 %vA1, %vB1\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " store i32 %add0, i32* %C0, align 4\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " store i32 %add0, ptr %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " store i32 %add1, i32* %C1, align 4\n" + " store i32 %add1, ptr %C1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -743,35 +743,35 @@ TEST_F(VPlanSlpTest, testInstrsInDifferentBBs2) { "%struct.Test = type { i32, i32 }\n" "%struct.Test3 = type { i32, i32, i32 }\n" "%struct.Test4xi8 = type { i8, i8, i8 }\n" - "define void @add_x2(%struct.Test* 
nocapture readonly %A, %struct.Test* " - "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "define void @add_x2(ptr nocapture readonly %A, ptr " + "nocapture readonly %B, ptr nocapture %C) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %bb2 ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %add0 = add nsw i32 %vA0, %vB0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %add1 = add nsw i32 %vA1, %vB1\n" " br label %bb2\n" "bb2:\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " store i32 %add0, i32* %C0, align 4\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " store i32 %add0, ptr %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " store i32 %add1, i32* %C1, align 4\n" + " store i32 %add1, ptr %C1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 
%exitcond, label %for.cond.cleanup, label %for.body\n" @@ -805,33 +805,33 @@ TEST_F(VPlanSlpTest, testSlpAtomicLoad) { "%struct.Test = type { i32, i32 }\n" "%struct.Test3 = type { i32, i32, i32 }\n" "%struct.Test4xi8 = type { i8, i8, i8 }\n" - "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " - "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "define void @add_x2(ptr nocapture readonly %A, ptr " + "nocapture readonly %B, ptr nocapture %C) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load atomic i32, i32* %A0 monotonic, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load atomic i32, ptr %A0 monotonic, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %add0 = add nsw i32 %vA0, %vB0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %add1 = add nsw i32 %vA1, %vB1\n" - " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " store i32 %add0, i32* %C0, align 4\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " store i32 %add0, ptr %C0, align 
4\n" + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " store i32 %add1, i32* %C1, align 4\n" + " store i32 %add1, ptr %C1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" @@ -864,33 +864,33 @@ TEST_F(VPlanSlpTest, testSlpAtomicStore) { "%struct.Test = type { i32, i32 }\n" "%struct.Test3 = type { i32, i32, i32 }\n" "%struct.Test4xi8 = type { i8, i8, i8 }\n" - "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " - "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "define void @add_x2(ptr nocapture readonly %A, ptr " + "nocapture readonly %B, ptr nocapture %C) {\n" "entry:\n" " br label %for.body\n" "for.body: ; preds = %for.body, " "%entry\n" " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" - " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A0 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 0\n" - " %vA0 = load i32, i32* %A0, align 4\n" - " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA0 = load i32, ptr %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 0\n" - " %vB0 = load i32, i32* %B0, align 4\n" + " %vB0 = load i32, ptr %B0, align 4\n" " %add0 = add nsw i32 %vA0, %vB0\n" - " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %A1 = getelementptr inbounds %struct.Test, ptr %A, i64 " "%indvars.iv, i32 1\n" - " %vA1 = load i32, i32* %A1, align 4\n" - " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %vA1 = load i32, ptr %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, ptr %B, i64 " "%indvars.iv, i32 1\n" - " %vB1 = load i32, i32* %B1, align 4\n" + " %vB1 = load i32, ptr %B1, align 4\n" " %add1 = add nsw i32 %vA1, %vB1\n" - " %C0 = getelementptr inbounds %struct.Test, 
%struct.Test* %C, i64 " + " %C0 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 0\n" - " store atomic i32 %add0, i32* %C0 monotonic, align 4\n" - " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " store atomic i32 %add0, ptr %C0 monotonic, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, ptr %C, i64 " "%indvars.iv, i32 1\n" - " store i32 %add1, i32* %C1, align 4\n" + " store i32 %add1, ptr %C1, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" From db9811cd131d66c1c3dff0222fc8bcc83a555846 Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Wed, 21 Feb 2024 11:53:01 -0800 Subject: [PATCH 143/351] Make lldbDataFormatters.py compatible with Python 3.8 (#82518) I just tried to load this into LLDB built against Python 3.8.5 and got the following error: `TypeError: 'type' object is not subscriptable`. I could fix this by wrapping the annotations in quotes but since Python 3.7 this syntax can be enabled with `from __future__ import annotations`. --- llvm/utils/lldbDataFormatters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/lldbDataFormatters.py b/llvm/utils/lldbDataFormatters.py index de101abdabc8e..4a34ad2a87c33 100644 --- a/llvm/utils/lldbDataFormatters.py +++ b/llvm/utils/lldbDataFormatters.py @@ -3,6 +3,7 @@ Load into LLDB with 'command script import /path/to/lldbDataFormatters.py' """ +from __future__ import annotations import collections import lldb From baf6bd303bd58a521809d456dd9b179636982fc5 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 21 Feb 2024 20:53:44 +0100 Subject: [PATCH 144/351] [Clang] Fixes to immediate-escalating functions (#82281) * Consider that immediate escalating function can appear at global scope, fixing a crash * Lambda conversion to function pointer was sometimes not performed in an immediate function context when it should be. 
Fixes #82258 --- clang/docs/ReleaseNotes.rst | 4 +++ clang/include/clang/Sema/Sema.h | 4 ++- clang/lib/Sema/SemaExpr.cpp | 4 +-- .../SemaCXX/cxx2b-consteval-propagate.cpp | 26 +++++++++++++++++++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 15905e0895509..dd217e16f1f1a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -297,6 +297,10 @@ Bug Fixes to C++ Support was only accepted at namespace scope but not at local function scope. - Clang no longer tries to call consteval constructors at runtime when they appear in a member initializer. (`#782154 `_`) +- Fix crash when using an immediate-escalated function at global scope. + (`#82258 `_) +- Correctly immediate-escalate lambda conversion functions. + (`#82258 `_) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 89215bf3d1c69..fcccac10f4733 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1158,7 +1158,9 @@ class Sema final { if (FD) { FD->setWillHaveBody(true); S.ExprEvalContexts.back().InImmediateFunctionContext = - FD->isImmediateFunction(); + FD->isImmediateFunction() || + S.ExprEvalContexts[S.ExprEvalContexts.size() - 2] + .isConstantEvaluated(); S.ExprEvalContexts.back().InImmediateEscalatingFunctionContext = S.getLangOpts().CPlusPlus20 && FD->isImmediateEscalating(); } else diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 37a7db889a6ea..816ee9e281359 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -18311,7 +18311,6 @@ void Sema::CheckUnusedVolatileAssignment(Expr *E) { } void Sema::MarkExpressionAsImmediateEscalating(Expr *E) { - assert(!FunctionScopes.empty() && "Expected a function scope"); assert(getLangOpts().CPlusPlus20 && ExprEvalContexts.back().InImmediateEscalatingFunctionContext && "Cannot mark an immediate 
escalating expression outside of an " @@ -18328,7 +18327,8 @@ void Sema::MarkExpressionAsImmediateEscalating(Expr *E) { } else { assert(false && "expected an immediately escalating expression"); } - getCurFunction()->FoundImmediateEscalatingExpression = true; + if (FunctionScopeInfo *FI = getCurFunction()) + FI->FoundImmediateEscalatingExpression = true; } ExprResult Sema::CheckForImmediateInvocation(ExprResult E, FunctionDecl *Decl) { diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp index 531a626228733..4a75392045d05 100644 --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp @@ -368,3 +368,29 @@ vector v{}; // expected-note@-2 {{in call to 'vector()'}} } + + +namespace GH82258 { + +template +constexpr auto none_of(R&& r, Pred pred) -> bool { return true; } + +struct info { int value; }; +consteval auto is_invalid(info i) -> bool { return false; } +constexpr info types[] = { {1}, {3}, {5}}; + +static_assert(none_of( + types, + +[](info i) consteval { + return is_invalid(i); + } +)); + +static_assert(none_of( + types, + []{ + return is_invalid; + }() +)); + +} From 5daf2001a1e4d71ce1273a1e7e31cf6e6ac37c10 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 21 Feb 2024 11:54:34 -0800 Subject: [PATCH 145/351] [BOLT] Fix memory leak in BinarySection (#82520) The change in #80950 exposed a memory leak in BinarySection. Let BinarySection manage memory passed via updateContents() unless a valid SectionID is set indicating that the contents are managed by JITLink. 
--- bolt/include/bolt/Core/BinarySection.h | 20 +++++++++++++------- bolt/lib/Core/BinarySection.cpp | 13 +------------ bolt/lib/Rewrite/RewriteInstance.cpp | 9 +++------ bolt/unittests/Core/BinaryContext.cpp | 9 +++++---- 4 files changed, 22 insertions(+), 29 deletions(-) diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h index a85dbf28950e3..0f179877bd3df 100644 --- a/bolt/include/bolt/Core/BinarySection.h +++ b/bolt/include/bolt/Core/BinarySection.h @@ -139,10 +139,7 @@ class BinarySection { Alignment = NewAlignment; ELFType = NewELFType; ELFFlags = NewELFFlags; - OutputSize = NewSize; - OutputContents = StringRef(reinterpret_cast(NewData), - NewData ? NewSize : 0); - IsFinalized = true; + updateContents(NewData, NewSize); } public: @@ -484,9 +481,18 @@ class BinarySection { void flushPendingRelocations(raw_pwrite_stream &OS, SymbolResolverFuncTy Resolver); - /// Change contents of the section. - void updateContents(const uint8_t *Data, size_t NewSize) { - OutputContents = StringRef(reinterpret_cast(Data), NewSize); + /// Change contents of the section. Unless the section has a valid SectionID, + /// the memory passed in \p NewData will be managed by the instance of + /// BinarySection. + void updateContents(const uint8_t *NewData, size_t NewSize) { + if (getOutputData() && !hasValidSectionID() && + (!hasSectionRef() || + OutputContents.data() != getContentsOrQuit(Section).data())) { + delete[] getOutputData(); + } + + OutputContents = StringRef(reinterpret_cast(NewData), + NewData ? 
NewSize : 0); OutputSize = NewSize; IsFinalized = true; } diff --git a/bolt/lib/Core/BinarySection.cpp b/bolt/lib/Core/BinarySection.cpp index 564c63e81914c..9ad49ca1b3a03 100644 --- a/bolt/lib/Core/BinarySection.cpp +++ b/bolt/lib/Core/BinarySection.cpp @@ -190,18 +190,7 @@ void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS, clearList(PendingRelocations); } -BinarySection::~BinarySection() { - if (isReordered()) { - delete[] getData(); - return; - } - - if (!isAllocatable() && !hasValidSectionID() && - (!hasSectionRef() || - OutputContents.data() != getContentsOrQuit(Section).data())) { - delete[] getOutputData(); - } -} +BinarySection::~BinarySection() { updateContents(nullptr, 0); } void BinarySection::clearRelocations() { clearList(Relocations); } diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 954c0fc86fa17..cde195c173907 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -4092,12 +4092,9 @@ void RewriteInstance::rewriteNoteSections() { return getNewValueForSymbol(S->getName()); }); - // Set/modify section info. - BinarySection &NewSection = BC->registerOrUpdateNoteSection( - SectionName, SectionData, Size, Section.sh_addralign, - !BSec->isWritable(), BSec->getELFType()); - NewSection.setOutputAddress(0); - NewSection.setOutputFileOffset(NextAvailableOffset); + // Section contents are no longer needed, but we need to update the size so + // that it will be reflected in the section header table. 
+ BSec->updateContents(nullptr, Size); NextAvailableOffset += Size; } diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp index 1fbb07bca966a..94ee65e63a1dc 100644 --- a/bolt/unittests/Core/BinaryContext.cpp +++ b/bolt/unittests/Core/BinaryContext.cpp @@ -77,10 +77,11 @@ TEST_P(BinaryContextTester, FlushPendingRelocCALL26) { // 12: bl func2 // 16: func2 - char Data[20] = {}; + constexpr size_t DataSize = 20; + uint8_t *Data = new uint8_t[DataSize]; BinarySection &BS = BC->registerOrUpdateSection( - ".text", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, - (uint8_t *)Data, sizeof(Data), 4); + ".text", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, Data, + DataSize, 4); MCSymbol *RelSymbol1 = BC->getOrCreateGlobalSymbol(4, "Func1"); ASSERT_TRUE(RelSymbol1); BS.addRelocation(8, RelSymbol1, ELF::R_AARCH64_CALL26, 0, 0, true); @@ -89,7 +90,7 @@ TEST_P(BinaryContextTester, FlushPendingRelocCALL26) { BS.addRelocation(12, RelSymbol2, ELF::R_AARCH64_CALL26, 0, 0, true); std::error_code EC; - SmallVector Vect(sizeof(Data)); + SmallVector Vect(DataSize); raw_svector_ostream OS(Vect); BS.flushPendingRelocations(OS, [&](const MCSymbol *S) { From 7fa8585fdefd98dd73940c74165aa55da1175f02 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Wed, 21 Feb 2024 12:21:35 -0800 Subject: [PATCH 146/351] [NFC][clang] Remove trailing whitespaces --- clang/lib/InstallAPI/FileList.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/InstallAPI/FileList.cpp b/clang/lib/InstallAPI/FileList.cpp index baa524db5d7f8..8a01248659b7d 100644 --- a/clang/lib/InstallAPI/FileList.cpp +++ b/clang/lib/InstallAPI/FileList.cpp @@ -22,13 +22,13 @@ InstallAPI JSON Input Format specification. { "headers" : [ # Required: Key must exist. { # Optional: May contain 0 or more header inputs. 
- "path" : "/usr/include/mach-o/dlfn.h", # Required: Path should point to destination + "path" : "/usr/include/mach-o/dlfn.h", # Required: Path should point to destination # location where applicable. "type" : "public", # Required: Maps to HeaderType for header. "language": "c++" # Optional: Language mode for header. } ], - "version" : "3" # Required: Version 3 supports language mode + "version" : "3" # Required: Version 3 supports language mode & project header input. } */ From cb1fed3a89e0cdc2660edaada1f0868cae3b7bcf Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Wed, 21 Feb 2024 20:04:20 +0000 Subject: [PATCH 147/351] [NVPTX] Correctly guard int -> bf16 on PTX version and SM version --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 6 ++++-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 16 ++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index fc6c642acbc07..7d2fe78d14229 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -788,13 +788,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // sm_80 only has conversions between f32 and bf16. Custom lower all other // bf16 conversions. 
- if (STI.hasBF16Math() && - (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { + if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) { setOperationAction( {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Custom); } + setOperationAction( + {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, + MVT::bf16, Custom); } setOperationAction(ISD::FROUND, MVT::f16, Promote); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 40d82ebecbed3..55a1955a7f497 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3247,23 +3247,23 @@ def : Pat<(f16 (uint_to_fp Int64Regs:$a)), // sint -> bf16 def : Pat<(bf16 (sint_to_fp Int1Regs:$a)), - (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; + (CVT_bf16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; def : Pat<(bf16 (sint_to_fp Int16Regs:$a)), - (CVT_bf16_s16 Int16Regs:$a, CvtRN)>; + (CVT_bf16_s16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; def : Pat<(bf16 (sint_to_fp Int32Regs:$a)), - (CVT_bf16_s32 Int32Regs:$a, CvtRN)>; + (CVT_bf16_s32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; def : Pat<(bf16 (sint_to_fp Int64Regs:$a)), - (CVT_bf16_s64 Int64Regs:$a, CvtRN)>; + (CVT_bf16_s64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; // uint -> bf16 def : Pat<(bf16 (uint_to_fp Int1Regs:$a)), - (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; + (CVT_bf16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; def : Pat<(bf16 (uint_to_fp Int16Regs:$a)), - (CVT_bf16_u16 Int16Regs:$a, CvtRN)>; + (CVT_bf16_u16 Int16Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; def : Pat<(bf16 (uint_to_fp Int32Regs:$a)), - (CVT_bf16_u32 Int32Regs:$a, CvtRN)>; + (CVT_bf16_u32 Int32Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; def : Pat<(bf16 
(uint_to_fp Int64Regs:$a)), - (CVT_bf16_u64 Int64Regs:$a, CvtRN)>; + (CVT_bf16_u64 Int64Regs:$a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; // sint -> f32 def : Pat<(f32 (sint_to_fp Int1Regs:$a)), From d4fd20258f63d30be638b04f10eaa469707759f0 Mon Sep 17 00:00:00 2001 From: mlevesquedion Date: Wed, 21 Feb 2024 12:28:05 -0800 Subject: [PATCH 148/351] [mlir] Use arith max or min ops instead of cmp + select (#82178) I believe the semantics should be the same, but this saves 1 op and simplifies the code. For example, the following two instructions: ``` %2 = cmp sgt %0, %1 %3 = select %2, %0, %1 ``` Are equivalent to: ``` %2 = maxsi %0 %1 ``` --- .../AffineToStandard/AffineToStandard.cpp | 16 ++-- .../ShapeToStandard/ShapeToStandard.cpp | 8 +- .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 20 ++--- .../TosaToLinalg/TosaToLinalgNamed.cpp | 9 +- mlir/lib/Dialect/SCF/Utils/Utils.cpp | 6 +- .../Dialect/Tosa/Utils/ConversionUtils.cpp | 9 +- .../AffineToStandard/lower-affine.mlir | 42 ++++------ .../expand-then-convert-to-llvm.mlir | 3 +- .../ShapeToStandard/shape-to-standard.mlir | 18 ++-- .../TosaToLinalg/tosa-to-linalg-named.mlir | 44 ++++------ .../TosaToLinalg/tosa-to-linalg-resize.mlir | 83 +++++++------------ .../TosaToLinalg/tosa-to-linalg.mlir | 61 +++++--------- mlir/test/Transforms/parametric-tiling.mlir | 12 +-- 13 files changed, 113 insertions(+), 218 deletions(-) diff --git a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp index 15ad6d8cdf629..e69f9c837ca1d 100644 --- a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp +++ b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp @@ -34,12 +34,7 @@ using namespace mlir::affine; using namespace mlir::vector; /// Given a range of values, emit the code that reduces them with "min" or "max" -/// depending on the provided comparison predicate. 
The predicate defines which -/// comparison to perform, "lt" for "min", "gt" for "max" and is used for the -/// `cmpi` operation followed by the `select` operation: -/// -/// %cond = arith.cmpi "predicate" %v0, %v1 -/// %result = select %cond, %v0, %v1 +/// depending on the provided comparison predicate, sgt for max and slt for min. /// /// Multiple values are scanned in a linear sequence. This creates a data /// dependences that wouldn't exist in a tree reduction, but is easier to @@ -48,13 +43,16 @@ static Value buildMinMaxReductionSeq(Location loc, arith::CmpIPredicate predicate, ValueRange values, OpBuilder &builder) { assert(!values.empty() && "empty min/max chain"); + assert(predicate == arith::CmpIPredicate::sgt || + predicate == arith::CmpIPredicate::slt); auto valueIt = values.begin(); Value value = *valueIt++; for (; valueIt != values.end(); ++valueIt) { - auto cmpOp = builder.create(loc, predicate, value, *valueIt); - value = builder.create(loc, cmpOp.getResult(), value, - *valueIt); + if (predicate == arith::CmpIPredicate::sgt) + value = builder.create(loc, value, *valueIt); + else + value = builder.create(loc, value, *valueIt); } return value; diff --git a/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp b/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp index a3e51aeed0735..de649f730ee9d 100644 --- a/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp +++ b/mlir/lib/Conversion/ShapeToStandard/ShapeToStandard.cpp @@ -147,9 +147,7 @@ LogicalResult BroadcastOpConverter::matchAndRewrite( // Find the maximum rank Value maxRank = ranks.front(); for (Value v : llvm::drop_begin(ranks, 1)) { - Value rankIsGreater = - lb.create(arith::CmpIPredicate::ugt, v, maxRank); - maxRank = lb.create(rankIsGreater, v, maxRank); + maxRank = lb.create(v, maxRank); } // Calculate the difference of ranks and the maximum rank for later offsets. 
@@ -262,9 +260,7 @@ LogicalResult IsBroadcastableOpConverter::matchAndRewrite( // Find the maximum rank Value maxRank = ranks.front(); for (Value v : llvm::drop_begin(ranks, 1)) { - Value rankIsGreater = - lb.create(arith::CmpIPredicate::ugt, v, maxRank); - maxRank = lb.create(rankIsGreater, v, maxRank); + maxRank = lb.create(v, maxRank); } // Calculate the difference of ranks and the maximum rank for later offsets. diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index f4f6dadfb3716..7eb32ebe3228f 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -61,10 +61,8 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, if (isa(op) && isa(elementTy)) { auto zero = rewriter.create( loc, rewriter.getZeroAttr(elementTy)); - auto cmp = rewriter.create(loc, arith::CmpIPredicate::sgt, - args[0], zero); auto neg = rewriter.create(loc, zero, args[0]); - return rewriter.create(loc, cmp, args[0], neg); + return rewriter.create(loc, args[0], neg); } // tosa::AddOp @@ -348,9 +346,7 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, } if (isa(op) && elementTy.isSignlessInteger()) { - auto predicate = rewriter.create( - loc, arith::CmpIPredicate::sgt, args[0], args[1]); - return rewriter.create(loc, predicate, args[0], args[1]); + return rewriter.create(loc, args[0], args[1]); } // tosa::MinimumOp @@ -359,9 +355,7 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, } if (isa(op) && elementTy.isSignlessInteger()) { - auto predicate = rewriter.create( - loc, arith::CmpIPredicate::slt, args[0], args[1]); - return rewriter.create(loc, predicate, args[0], args[1]); + return rewriter.create(loc, args[0], args[1]); } // tosa::CeilOp @@ -1000,9 +994,7 @@ static Value createLinalgBodyCalculationForReduceOp(Operation *op, } if (isa(op) && isa(elementTy)) { - auto predicate = 
rewriter.create( - loc, arith::CmpIPredicate::slt, args[0], args[1]); - return rewriter.create(loc, predicate, args[0], args[1]); + return rewriter.create(loc, args[0], args[1]); } if (isa(op) && isa(elementTy)) { @@ -1010,9 +1002,7 @@ static Value createLinalgBodyCalculationForReduceOp(Operation *op, } if (isa(op) && isa(elementTy)) { - auto predicate = rewriter.create( - loc, arith::CmpIPredicate::sgt, args[0], args[1]); - return rewriter.create(loc, predicate, args[0], args[1]); + return rewriter.create(loc, args[0], args[1]); } if (isa(op) && elementTy.isInteger(1)) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp index 607a603cca810..3f39cbf03a9a8 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -845,10 +845,7 @@ class AvgPool2dConverter : public OpRewritePattern { auto padVal = rewriter.create(loc, pad); Value dpos = rewriter.create(loc, pos, padVal); - Value cmp = rewriter.create( - loc, arith::CmpIPredicate::slt, dpos, zero); - Value offset = - rewriter.create(loc, cmp, dpos, zero); + Value offset = rewriter.create(loc, dpos, zero); return rewriter.create(loc, valid, offset) ->getResult(0); }; @@ -868,9 +865,7 @@ class AvgPool2dConverter : public OpRewritePattern { // Determine how much padding was included. val = padFn(val, left, pad[i * 2]); val = padFn(val, right, pad[i * 2 + 1]); - Value cmp = rewriter.create( - loc, arith::CmpIPredicate::slt, val, one); - return rewriter.create(loc, cmp, one, val); + return rewriter.create(loc, one, val); }; // Compute the indices from either end. 
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index 536c02feca1bd..502d7e197a6f6 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -791,10 +791,8 @@ static Loops stripmineSink(scf::ForOp forOp, Value factor, // Insert newForOp before the terminator of `t`. auto b = OpBuilder::atBlockTerminator((t.getBody())); Value stepped = b.create(t.getLoc(), iv, forOp.getStep()); - Value less = b.create(t.getLoc(), arith::CmpIPredicate::slt, - forOp.getUpperBound(), stepped); - Value ub = b.create(t.getLoc(), less, - forOp.getUpperBound(), stepped); + Value ub = + b.create(t.getLoc(), forOp.getUpperBound(), stepped); // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses. auto newForOp = b.create(t.getLoc(), iv, ub, originalStep); diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp index ee428b201d007..4fc97115064f3 100644 --- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp +++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp @@ -39,13 +39,8 @@ Value mlir::tosa::clampFloatHelper(Location loc, Value arg, Value min, Value mlir::tosa::clampIntHelper(Location loc, Value arg, Value min, Value max, OpBuilder &rewriter) { - auto smallerThanMin = - rewriter.create(loc, arith::CmpIPredicate::slt, arg, min); - auto minOrArg = - rewriter.create(loc, smallerThanMin, min, arg); - auto largerThanMax = - rewriter.create(loc, arith::CmpIPredicate::slt, max, arg); - return rewriter.create(loc, largerThanMax, max, minOrArg); + auto minOrArg = rewriter.create(loc, min, arg); + return rewriter.create(loc, max, minOrArg); } bool mlir::tosa::validIntegerRange(IntegerType ty, int64_t value) { diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir index 92608135d24b0..00d7b6b8d65f6 100644 --- a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir +++ 
b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir @@ -371,16 +371,14 @@ func.func @if_for() { // CHECK-NEXT: %[[c1:.*]] = arith.constant 1 : index // CHECK-NEXT: for %{{.*}} = %[[c0]] to %[[c42]] step %[[c1]] { // CHECK-NEXT: %[[cm1:.*]] = arith.constant -1 : index -// CHECK-NEXT: %[[a:.*]] = arith.muli %{{.*}}, %[[cm1]] : index -// CHECK-NEXT: %[[b:.*]] = arith.addi %[[a]], %{{.*}} : index -// CHECK-NEXT: %[[c:.*]] = arith.cmpi sgt, %{{.*}}, %[[b]] : index -// CHECK-NEXT: %[[d:.*]] = arith.select %[[c]], %{{.*}}, %[[b]] : index +// CHECK-NEXT: %[[mul0:.*]] = arith.muli %{{.*}}, %[[cm1]] : index +// CHECK-NEXT: %[[add0:.*]] = arith.addi %[[mul0]], %{{.*}} : index +// CHECK-NEXT: %[[max:.*]] = arith.maxsi %{{.*}}, %[[add0]] : index // CHECK-NEXT: %[[c10:.*]] = arith.constant 10 : index -// CHECK-NEXT: %[[e:.*]] = arith.addi %{{.*}}, %[[c10]] : index -// CHECK-NEXT: %[[f:.*]] = arith.cmpi slt, %{{.*}}, %[[e]] : index -// CHECK-NEXT: %[[g:.*]] = arith.select %[[f]], %{{.*}}, %[[e]] : index +// CHECK-NEXT: %[[add1:.*]] = arith.addi %{{.*}}, %[[c10]] : index +// CHECK-NEXT: %[[min:.*]] = arith.minsi %{{.*}}, %[[add1]] : index // CHECK-NEXT: %[[c1_0:.*]] = arith.constant 1 : index -// CHECK-NEXT: for %{{.*}} = %[[d]] to %[[g]] step %[[c1_0]] { +// CHECK-NEXT: for %{{.*}} = %[[max]] to %[[min]] step %[[c1_0]] { // CHECK-NEXT: call @body2(%{{.*}}, %{{.*}}) : (index, index) -> () // CHECK-NEXT: } // CHECK-NEXT: } @@ -397,25 +395,19 @@ func.func @loop_min_max(%N : index) { #map_7_values = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> -// Check that the "min" (cmpi slt + select) reduction sequence is emitted +// Check that the "min" reduction sequence is emitted // correctly for an affine map with 7 results. 
// CHECK-LABEL: func @min_reduction_tree // CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : index -// CHECK-NEXT: %[[c01:.+]] = arith.cmpi slt, %{{.*}}, %{{.*}} : index -// CHECK-NEXT: %[[r01:.+]] = arith.select %[[c01]], %{{.*}}, %{{.*}} : index -// CHECK-NEXT: %[[c012:.+]] = arith.cmpi slt, %[[r01]], %{{.*}} : index -// CHECK-NEXT: %[[r012:.+]] = arith.select %[[c012]], %[[r01]], %{{.*}} : index -// CHECK-NEXT: %[[c0123:.+]] = arith.cmpi slt, %[[r012]], %{{.*}} : index -// CHECK-NEXT: %[[r0123:.+]] = arith.select %[[c0123]], %[[r012]], %{{.*}} : index -// CHECK-NEXT: %[[c01234:.+]] = arith.cmpi slt, %[[r0123]], %{{.*}} : index -// CHECK-NEXT: %[[r01234:.+]] = arith.select %[[c01234]], %[[r0123]], %{{.*}} : index -// CHECK-NEXT: %[[c012345:.+]] = arith.cmpi slt, %[[r01234]], %{{.*}} : index -// CHECK-NEXT: %[[r012345:.+]] = arith.select %[[c012345]], %[[r01234]], %{{.*}} : index -// CHECK-NEXT: %[[c0123456:.+]] = arith.cmpi slt, %[[r012345]], %{{.*}} : index -// CHECK-NEXT: %[[r0123456:.+]] = arith.select %[[c0123456]], %[[r012345]], %{{.*}} : index +// CHECK-NEXT: %[[min:.+]] = arith.minsi %{{.*}}, %{{.*}} : index +// CHECK-NEXT: %[[min_0:.+]] = arith.minsi %[[min]], %{{.*}} : index +// CHECK-NEXT: %[[min_1:.+]] = arith.minsi %[[min_0]], %{{.*}} : index +// CHECK-NEXT: %[[min_2:.+]] = arith.minsi %[[min_1]], %{{.*}} : index +// CHECK-NEXT: %[[min_3:.+]] = arith.minsi %[[min_2]], %{{.*}} : index +// CHECK-NEXT: %[[min_4:.+]] = arith.minsi %[[min_3]], %{{.*}} : index // CHECK-NEXT: %[[c1:.*]] = arith.constant 1 : index -// CHECK-NEXT: for %{{.*}} = %[[c0]] to %[[r0123456]] step %[[c1]] { +// CHECK-NEXT: for %{{.*}} = %[[c0]] to %[[min_4]] step %[[c1]] { // CHECK-NEXT: call @body(%{{.*}}) : (index) -> () // CHECK-NEXT: } // CHECK-NEXT: return @@ -690,8 +682,7 @@ func.func @affine_min(%arg0: index, %arg1: index) -> index{ // CHECK: %[[Cm2:.*]] = arith.constant -1 // CHECK: %[[neg2:.*]] = arith.muli %[[ARG0]], %[[Cm2:.*]] // CHECK: %[[second:.*]] = arith.addi %[[ARG1]], 
%[[neg2]] - // CHECK: %[[cmp:.*]] = arith.cmpi slt, %[[first]], %[[second]] - // CHECK: arith.select %[[cmp]], %[[first]], %[[second]] + // CHECK: arith.minsi %[[first]], %[[second]] %0 = affine.min affine_map<(d0,d1) -> (d0 - d1, d1 - d0)>(%arg0, %arg1) return %0 : index } @@ -705,8 +696,7 @@ func.func @affine_max(%arg0: index, %arg1: index) -> index{ // CHECK: %[[Cm2:.*]] = arith.constant -1 // CHECK: %[[neg2:.*]] = arith.muli %[[ARG0]], %[[Cm2:.*]] // CHECK: %[[second:.*]] = arith.addi %[[ARG1]], %[[neg2]] - // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[first]], %[[second]] - // CHECK: arith.select %[[cmp]], %[[first]], %[[second]] + // CHECK: arith.maxsi %[[first]], %[[second]] %0 = affine.max affine_map<(d0,d1) -> (d0 - d1, d1 - d0)>(%arg0, %arg1) return %0 : index } diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir index eb45112b117c0..87d613986c7c3 100644 --- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir @@ -554,8 +554,7 @@ func.func @collapse_shape_dynamic(%arg0 : memref<1x2x?xf32>) -> memref<1x?xf32> // CHECK: %[[SIZE1_TO_IDX:.*]] = builtin.unrealized_conversion_cast %[[FINAL_SIZE1]] : i64 to index // CHECK: %[[FINAL_SIZE1:.*]] = builtin.unrealized_conversion_cast %[[SIZE1_TO_IDX]] : index to i64 // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) : i64 -// CHECK: %[[IS_MIN_STRIDE1:.*]] = llvm.icmp "slt" %[[STRIDE1]], %[[C1]] : i64 -// CHECK: %[[MIN_STRIDE1:.*]] = llvm.select %[[IS_MIN_STRIDE1]], %[[STRIDE1]], %[[C1]] : i1, i64 +// CHECK: %[[MIN_STRIDE1:.*]] = llvm.intr.smin(%[[STRIDE1]], %[[C1]]) : (i64, i64) -> i64 // CHECK: %[[MIN_STRIDE1_TO_IDX:.*]] = builtin.unrealized_conversion_cast %[[MIN_STRIDE1]] : i64 to index // CHECK: %[[MIN_STRIDE1:.*]] = builtin.unrealized_conversion_cast %[[MIN_STRIDE1_TO_IDX]] : index to i64 // CHECK: %[[DESC:.*]] = llvm.mlir.undef : 
!llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> diff --git a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir index cb3af973daee2..3b73c513b7955 100644 --- a/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir +++ b/mlir/test/Conversion/ShapeToStandard/shape-to-standard.mlir @@ -377,10 +377,8 @@ func.func @try_is_broadcastable (%a : tensor<2xindex>, %b : tensor<3xindex>, %c // CHECK: %[[RANK0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<2xindex> // CHECK: %[[RANK1:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<3xindex> // CHECK: %[[RANK2:.*]] = tensor.dim %[[ARG2]], %[[C0]] : tensor<2xindex> -// CHECK: %[[CMP0:.*]] = arith.cmpi ugt, %[[RANK1]], %[[RANK0]] : index -// CHECK: %[[LARGER_DIM:.*]] = arith.select %[[CMP0]], %[[RANK1]], %[[RANK0]] : index -// CHECK: %[[CMP1:.*]] = arith.cmpi ugt, %[[RANK2]], %[[LARGER_DIM]] : index -// CHECK: %[[MAX_RANK:.*]] = arith.select %[[CMP1]], %[[RANK2]], %[[LARGER_DIM]] : index +// CHECK: %[[MAX0:.*]] = arith.maxui %[[RANK1]], %[[RANK0]] : index +// CHECK: %[[MAX_RANK:.*]] = arith.maxui %[[RANK2]], %[[MAX0]] : index // CHECK: %[[DIM_DIFF0:.*]] = arith.subi %[[MAX_RANK]], %[[RANK0]] : index // CHECK: %[[DIM_DIFF1:.*]] = arith.subi %[[MAX_RANK]], %[[RANK1]] : index // CHECK: %[[DIM_DIFF2:.*]] = arith.subi %[[MAX_RANK]], %[[RANK2]] : index @@ -467,10 +465,8 @@ func.func @broadcast(%a : tensor<2xindex>, %b : tensor<3xindex>, %c : tensor<2xi // CHECK: %[[RANK0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<2xindex> // CHECK: %[[RANK1:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<3xindex> // CHECK: %[[RANK2:.*]] = tensor.dim %[[ARG2]], %[[C0]] : tensor<2xindex> -// CHECK: %[[CMP0:.*]] = arith.cmpi ugt, %[[RANK1]], %[[RANK0]] : index -// CHECK: %[[LARGER_DIM:.*]] = arith.select %[[CMP0]], %[[RANK1]], %[[RANK0]] : index -// CHECK: %[[CMP1:.*]] = arith.cmpi ugt, %[[RANK2]], %[[LARGER_DIM]] : index -// CHECK: %[[MAX_RANK:.*]] = arith.select 
%[[CMP1]], %[[RANK2]], %[[LARGER_DIM]] : index +// CHECK: %[[MAX0:.*]] = arith.maxui %[[RANK1]], %[[RANK0]] : index +// CHECK: %[[MAX_RANK:.*]] = arith.maxui %[[RANK2]], %[[MAX0]] : index // CHECK: %[[DIM_DIFF0:.*]] = arith.subi %[[MAX_RANK]], %[[RANK0]] : index // CHECK: %[[DIM_DIFF1:.*]] = arith.subi %[[MAX_RANK]], %[[RANK1]] : index // CHECK: %[[DIM_DIFF2:.*]] = arith.subi %[[MAX_RANK]], %[[RANK2]] : index @@ -559,10 +555,8 @@ func.func @broadcast_3_shapes_different_extents(%a : tensor<2xindex>, // CHECK: %[[RANK0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<2xindex> // CHECK: %[[RANK1:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<3xindex> // CHECK: %[[RANK2:.*]] = tensor.dim %[[ARG2]], %[[C0]] : tensor<2xindex> -// CHECK: %[[CMP0:.*]] = arith.cmpi ugt, %[[RANK1]], %[[RANK0]] : index -// CHECK: %[[LARGER_DIM:.*]] = arith.select %[[CMP0]], %[[RANK1]], %[[RANK0]] : index -// CHECK: %[[CMP1:.*]] = arith.cmpi ugt, %[[RANK2]], %[[LARGER_DIM]] : index -// CHECK: %[[MAX_RANK:.*]] = arith.select %[[CMP1]], %[[RANK2]], %[[LARGER_DIM]] : index +// CHECK: %[[MAX0:.*]] = arith.maxui %[[RANK1]], %[[RANK0]] : index +// CHECK: %[[MAX_RANK:.*]] = arith.maxui %[[RANK2]], %[[MAX0]] : index // CHECK: %[[DIM_DIFF0:.*]] = arith.subi %[[MAX_RANK]], %[[RANK0]] : index // CHECK: %[[DIM_DIFF1:.*]] = arith.subi %[[MAX_RANK]], %[[RANK1]] : index // CHECK: %[[DIM_DIFF2:.*]] = arith.subi %[[MAX_RANK]], %[[RANK2]] : index diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir index 51ebcad079780..e64903671e599 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -263,16 +263,13 @@ func.func @avg_pool_f32(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) // CHECK: %[[SRC_END:.+]] = arith.muli %[[END]], %[[STRIDE]] // CHECK: %[[PAD_START:.+]] = arith.constant 1 // CHECK: %[[START_SUB:.+]] = arith.subi %[[SRC_START]], 
%[[PAD_START]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[START_SUB]], %[[ZERO]] - // CHECK: %[[OFFSET:.+]] = arith.select %[[CMP]], %[[START_SUB]], %[[ZERO]] + // CHECK: %[[OFFSET:.+]] = arith.minsi %[[START_SUB]], %[[ZERO]] // CHECK: %[[START_OFFSET:.+]] = arith.addi %[[KSIZE]], %[[OFFSET]] // CHECK: %[[PAD_END:.+]] = arith.constant 1 // CHECK: %[[END_SUB:.+]] = arith.subi %[[SRC_END]], %[[PAD_END]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[END_SUB]], %[[ZERO]] - // CHECK: %[[OFFSET:.+]] = arith.select %[[CMP]], %[[END_SUB]], %[[ZERO]] + // CHECK: %[[OFFSET:.+]] = arith.minsi %[[END_SUB]], %[[ZERO]] // CHECK: %[[END_OFFSET:.+]] = arith.addi %[[START_OFFSET]], %[[OFFSET]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[END_OFFSET]], %[[ONE]] - // CHECK: %[[KHEIGHT:.+]] = arith.select %[[CMP]], %[[ONE]], %[[END_OFFSET]] + // CHECK: %[[KHEIGHT:.+]] = arith.maxsi %[[ONE]], %[[END_OFFSET]] // Compute how much of the width does not include padding: // CHECK: %[[STRIDE:.+]] = arith.constant 1 @@ -283,16 +280,13 @@ func.func @avg_pool_f32(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) // CHECK: %[[SRC_END:.+]] = arith.muli %[[END]], %[[STRIDE]] // CHECK: %[[PAD_START:.+]] = arith.constant 1 // CHECK: %[[START_SUB:.+]] = arith.subi %[[SRC_START]], %[[PAD_START]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[START_SUB]], %[[ZERO]] - // CHECK: %[[OFFSET:.+]] = arith.select %[[CMP]], %[[START_SUB]], %[[ZERO]] + // CHECK: %[[OFFSET:.+]] = arith.minsi %[[START_SUB]], %[[ZERO]] // CHECK: %[[START_OFFSET:.+]] = arith.addi %[[KSIZE]], %[[OFFSET]] // CHECK: %[[PAD_END:.+]] = arith.constant 1 // CHECK: %[[END_SUB:.+]] = arith.subi %[[SRC_END]], %[[PAD_END]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[END_SUB]], %[[ZERO]] - // CHECK: %[[OFFSET:.+]] = arith.select %[[CMP]], %[[END_SUB]], %[[ZERO]] + // CHECK: %[[OFFSET:.+]] = arith.minsi %[[END_SUB]], %[[ZERO]] // CHECK: %[[END_OFFSET:.+]] = arith.addi %[[START_OFFSET]], %[[OFFSET]] - // CHECK: %[[CMP:.+]] = arith.cmpi 
slt, %[[END_OFFSET]], %[[ONE]] - // CHECK: %[[KWIDTH:.+]] = arith.select %[[CMP]], %[[ONE]], %[[END_OFFSET]] + // CHECK: %[[KWIDTH:.+]] = arith.maxsi %[[ONE]], %[[END_OFFSET]] // Divide the summed value by the number of values summed. // CHECK: %[[COUNT:.+]] = arith.muli %[[KHEIGHT]], %[[KWIDTH]] @@ -353,16 +347,13 @@ func.func @avg_pool_f16_f32acc(%arg0: tensor<1x6x34x62xf16>) -> (tensor<1x5x33x6 // CHECK: %[[SRC_END:.+]] = arith.muli %[[END]], %[[STRIDE]] // CHECK: %[[PAD_START:.+]] = arith.constant 1 // CHECK: %[[START_SUB:.+]] = arith.subi %[[SRC_START]], %[[PAD_START]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[START_SUB]], %[[ZERO]] - // CHECK: %[[OFFSET:.+]] = arith.select %[[CMP]], %[[START_SUB]], %[[ZERO]] + // CHECK: %[[OFFSET:.+]] = arith.minsi %[[START_SUB]], %[[ZERO]] // CHECK: %[[START_OFFSET:.+]] = arith.addi %[[KSIZE]], %[[OFFSET]] // CHECK: %[[PAD_END:.+]] = arith.constant 1 // CHECK: %[[END_SUB:.+]] = arith.subi %[[SRC_END]], %[[PAD_END]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[END_SUB]], %[[ZERO]] - // CHECK: %[[OFFSET:.+]] = arith.select %[[CMP]], %[[END_SUB]], %[[ZERO]] + // CHECK: %[[OFFSET:.+]] = arith.minsi %[[END_SUB]], %[[ZERO]] // CHECK: %[[END_OFFSET:.+]] = arith.addi %[[START_OFFSET]], %[[OFFSET]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[END_OFFSET]], %[[ONE]] - // CHECK: %[[KHEIGHT:.+]] = arith.select %[[CMP]], %[[ONE]], %[[END_OFFSET]] + // CHECK: %[[KHEIGHT:.+]] = arith.maxsi %[[ONE]], %[[END_OFFSET]] // Compute how much of the width does not include padding: // CHECK: %[[STRIDE:.+]] = arith.constant 1 @@ -373,16 +364,13 @@ func.func @avg_pool_f16_f32acc(%arg0: tensor<1x6x34x62xf16>) -> (tensor<1x5x33x6 // CHECK: %[[SRC_END:.+]] = arith.muli %[[END]], %[[STRIDE]] // CHECK: %[[PAD_START:.+]] = arith.constant 1 // CHECK: %[[START_SUB:.+]] = arith.subi %[[SRC_START]], %[[PAD_START]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[START_SUB]], %[[ZERO]] - // CHECK: %[[OFFSET:.+]] = arith.select %[[CMP]], %[[START_SUB]], %[[ZERO]] 
+ // CHECK: %[[OFFSET:.+]] = arith.minsi %[[START_SUB]], %[[ZERO]] // CHECK: %[[START_OFFSET:.+]] = arith.addi %[[KSIZE]], %[[OFFSET]] // CHECK: %[[PAD_END:.+]] = arith.constant 1 // CHECK: %[[END_SUB:.+]] = arith.subi %[[SRC_END]], %[[PAD_END]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[END_SUB]], %[[ZERO]] - // CHECK: %[[OFFSET:.+]] = arith.select %[[CMP]], %[[END_SUB]], %[[ZERO]] + // CHECK: %[[OFFSET:.+]] = arith.minsi %[[END_SUB]], %[[ZERO]] // CHECK: %[[END_OFFSET:.+]] = arith.addi %[[START_OFFSET]], %[[OFFSET]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[END_OFFSET]], %[[ONE]] - // CHECK: %[[KWIDTH:.+]] = arith.select %[[CMP]], %[[ONE]], %[[END_OFFSET]] + // CHECK: %[[KWIDTH:.+]] = arith.maxsi %[[ONE]], %[[END_OFFSET]] // Divide the summed value by the number of values summed. // CHECK: %[[COUNT:.+]] = arith.muli %[[KHEIGHT]], %[[KWIDTH]] @@ -407,7 +395,7 @@ func.func @avg_pool_i8(%arg0: tensor<1x6x34x62xi8>) -> (tensor<1x5x33x62xi8>) { // Only different behavior is how the division is performed. // First we compute the mul and shift values for average pool: - // CHECK: %[[COUNT:.+]] = arith.muli %21, %35 + // CHECK: %[[COUNT:.+]] = arith.muli %{{[0-9]+}}, %{{[0-9]+}} // CHECK: %[[ICAST:.+]] = arith.index_cast %[[COUNT]] // CHECK: %[[C1:.+]] = arith.constant 1 // CHECK: %[[C32:.+]] = arith.constant 32 @@ -428,10 +416,8 @@ func.func @avg_pool_i8(%arg0: tensor<1x6x34x62xi8>) -> (tensor<1x5x33x62xi8>) { // Perform the normalization. 
// CHECK: %[[CMIN:.+]] = arith.constant -128 // CHECK: %[[CMAX:.+]] = arith.constant 127 - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[SCALED]], %[[CMIN]] - // CHECK: %[[SEL:.+]] = arith.select %[[CMP]], %[[CMIN]], %[[SCALED]] - // CHECK: %[[CMP:.+]] = arith.cmpi slt, %[[CMAX]], %[[SCALED]] - // CHECK: %[[CLAMP:.+]] = arith.select %[[CMP]], %[[CMAX]], %[[SEL]] + // CHECK: %[[LOW:.+]] = arith.maxsi %[[CMIN]], %[[SCALED]] + // CHECK: %[[CLAMP:.+]] = arith.minsi %[[CMAX]], %[[LOW]] // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLAMP]] // CHECK: linalg.yield %[[TRUNC]] %0 = tosa.avg_pool2d %arg0 {acc_type = i32, pad = array, kernel = array, stride = array} : (tensor<1x6x34x62xi8>) -> tensor<1x5x33x62xi8> diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir index aedc6b7fae4a4..468e92e2a2661 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir @@ -167,22 +167,18 @@ func.func @resize_nearest_int(%arg0: tensor<1x15x13x1xi8>) -> () { // CHECK: %[[PRED_Y:.*]] = arith.cmpi sge, %[[D_Y_DOUBLE]], %[[SCALE_Y_N]] // CHECK: %[[VAL_37:.*]] = arith.select %[[PRED_Y]], %[[ONE]], %[[ZERO]] // CHECK: %[[VAL_39:.*]] = arith.addi %[[I_Y]], %[[VAL_37]] - // CHECK: %[[VAL_41:.*]] = arith.cmpi slt, %[[VAL_39]], %[[ZERO]] - // CHECK: %[[VAL_42:.*]] = arith.select %[[VAL_41]], %[[ZERO]], %[[VAL_39]] - // CHECK: %[[VAL_43:.*]] = arith.cmpi slt, %[[Y_MAX]], %[[VAL_39]] - // CHECK: %[[VAL_44:.*]] = arith.select %[[VAL_43]], %[[Y_MAX]], %[[VAL_42]] - // CHECK: %[[IDY:.+]] = arith.index_cast %[[VAL_44]] + // CHECK: %[[LOWER:.*]] = arith.maxsi %[[ZERO]], %[[VAL_39]] + // CHECK: %[[CLAMPED:.*]] = arith.minsi %[[Y_MAX]], %[[LOWER]] + // CHECK: %[[IDY:.+]] = arith.index_cast %[[CLAMPED]] // Compute the offset and bound for the X position. 
// CHECK: %[[D_X_DOUBLE:.*]] = arith.shli %[[D_X]], %[[ONE]] // CHECK: %[[PRED_X:.*]] = arith.cmpi sge, %[[D_X_DOUBLE]], %[[SCALE_X_N]] // CHECK: %[[VAL_38:.*]] = arith.select %[[PRED_X]], %[[ONE]], %[[ZERO]] // CHECK: %[[VAL_40:.*]] = arith.addi %[[I_X]], %[[VAL_38]] - // CHECK: %[[VAL_45:.*]] = arith.cmpi slt, %[[VAL_40]], %[[ZERO]] - // CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_45]], %[[ZERO]], %[[VAL_40]] - // CHECK: %[[VAL_47:.*]] = arith.cmpi slt, %[[X_MAX]], %[[VAL_40]] - // CHECK: %[[VAL_48:.*]] = arith.select %[[VAL_47]], %[[X_MAX]], %[[VAL_46]] - // CHECK: %[[IDX:.+]] = arith.index_cast %[[VAL_48]] + // CHECK: %[[LOWER:.*]] = arith.maxsi %[[ZERO]], %[[VAL_40]] + // CHECK: %[[CLAMPED:.*]] = arith.minsi %[[X_MAX]], %[[LOWER]] + // CHECK: %[[IDX:.+]] = arith.index_cast %[[CLAMPED]] // CHECK: %[[EXTRACT:.+]] = tensor.extract %arg0[%[[IDX_0]], %[[IDY]], %[[IDX]], %[[IDX_3]]] // CHECK: linalg.yield %[[EXTRACT]] @@ -236,29 +232,21 @@ func.func @resize_bilinear_int(%arg0: tensor<1x19x20x1xi8>) { // Bound check each dimension. 
- // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[I_Y]], %[[ZERO]] - // CHECK: %[[BOUND:.*]] = arith.select %[[PRED]], %[[ZERO]], %[[I_Y]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[Y_MAX]], %[[I_Y]] - // CHECK: %[[YLO:.*]] = arith.select %[[PRED]], %[[Y_MAX]], %[[BOUND]] + // CHECK: %[[BOUND:.*]] = arith.maxsi %[[ZERO]], %[[I_Y]] + // CHECK: %[[YLO:.*]] = arith.minsi %[[Y_MAX]], %[[BOUND]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[Y1]], %[[ZERO]] - // CHECK: %[[BOUND:.*]] = arith.select %[[PRED]], %[[ZERO]], %[[Y1]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[Y_MAX]], %[[Y1]] - // CHECK: %[[YHI:.*]] = arith.select %[[PRED]], %[[Y_MAX]], %[[BOUND]] + // CHECK: %[[BOUND:.*]] = arith.maxsi %[[ZERO]], %[[Y1]] + // CHECK: %[[YHI:.*]] = arith.minsi %[[Y_MAX]], %[[BOUND]] // CHECK: %[[YLOI:.+]] = arith.index_cast %[[YLO]] // CHECK: %[[YHII:.+]] = arith.index_cast %[[YHI]] // CHECK: %[[X1:.*]] = arith.addi %[[I_X]], %[[ONE]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[I_X]], %[[ZERO]] - // CHECK: %[[BOUND:.*]] = arith.select %[[PRED]], %[[ZERO]], %[[I_X]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[X_MAX]], %[[I_X]] - // CHECK: %[[XLO:.*]] = arith.select %[[PRED]], %[[X_MAX]], %[[BOUND]] + // CHECK: %[[BOUND:.*]] = arith.maxsi %[[ZERO]], %[[I_X]] + // CHECK: %[[XLO:.*]] = arith.minsi %[[X_MAX]], %[[BOUND]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[X1]], %[[ZERO]] - // CHECK: %[[BOUND:.*]] = arith.select %[[PRED]], %[[ZERO]], %[[X1]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[X_MAX]], %[[X1]] - // CHECK: %[[XHI:.*]] = arith.select %[[PRED]], %[[X_MAX]], %[[BOUND]] + // CHECK: %[[BOUND:.*]] = arith.maxsi %[[ZERO]], %[[X1]] + // CHECK: %[[XHI:.*]] = arith.minsi %[[X_MAX]], %[[BOUND]] // CHECK: %[[XLOI:.+]] = arith.index_cast %[[XLO]] // CHECK: %[[XHII:.+]] = arith.index_cast %[[XHI]] @@ -352,21 +340,17 @@ func.func @resize_nearest_fp32(%input: tensor<1x50x48x1xf32>) -> () { // CHECK: %[[PRED_Y:.*]] = arith.cmpf oge, %[[D_Y]], %[[HALF]] // CHECK: 
%[[ROUND_Y:.*]] = arith.select %[[PRED_Y]], %[[ONE]], %[[ZERO]] // CHECK: %[[VAL_48:.*]] = arith.addi %[[VAL_39]], %[[ROUND_Y]] - // CHECK: %[[VAL_50:.*]] = arith.cmpi slt, %[[VAL_48]], %[[ZERO]] - // CHECK: %[[VAL_51:.*]] = arith.select %[[VAL_50]], %[[ZERO]], %[[VAL_48]] - // CHECK: %[[VAL_52:.*]] = arith.cmpi slt, %[[YMAX]], %[[VAL_48]] - // CHECK: %[[VAL_53:.*]] = arith.select %[[VAL_52]], %[[YMAX]], %[[VAL_51]] - // CHECK: %[[IDY:.*]] = arith.index_cast %[[VAL_53]] + // CHECK: %[[LOWER:.*]] = arith.maxsi %[[ZERO]], %[[VAL_48]] + // CHECK: %[[CLAMPED:.*]] = arith.minsi %[[YMAX]], %[[LOWER]] + // CHECK: %[[IDY:.*]] = arith.index_cast %[[CLAMPED]] // CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 // CHECK: %[[PRED_X:.*]] = arith.cmpf oge, %[[D_X]], %[[HALF]] // CHECK: %[[ROUND_X:.*]] = arith.select %[[PRED_X]], %[[ONE]], %[[ZERO]] // CHECK: %[[VAL_49:.*]] = arith.addi %[[VAL_40]], %[[ROUND_X]] - // CHECK: %[[VAL_54:.*]] = arith.cmpi slt, %[[VAL_49]], %[[ZERO]] - // CHECK: %[[VAL_55:.*]] = arith.select %[[VAL_54]], %[[ZERO]], %[[VAL_49]] - // CHECK: %[[VAL_56:.*]] = arith.cmpi slt, %[[XMAX]], %[[VAL_49]] - // CHECK: %[[VAL_57:.*]] = arith.select %[[VAL_56]], %[[XMAX]], %[[VAL_55]] - // CHECK: %[[IDX:.*]] = arith.index_cast %[[VAL_57]] + // CHECK: %[[LOWER:.*]] = arith.maxsi %[[ZERO]], %[[VAL_49]] + // CHECK: %[[CLAMPED:.*]] = arith.minsi %[[XMAX]], %[[LOWER]] + // CHECK: %[[IDX:.*]] = arith.index_cast %[[CLAMPED]] // CHECK: %[[EXTRACT:.+]] = tensor.extract %arg0[%[[IDX0]], %[[IDY]], %[[IDX]], %[[IDX3]]] // CHECK: linalg.yield %[[EXTRACT]] @@ -429,28 +413,21 @@ func.func @resize_bilinear_fp(%input: tensor<1x23x24x1xf32>) -> () { // CHECK: %[[Y1:.*]] = arith.addi %[[I_Y]], %[[ONE]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[I_Y]], %[[ZERO]] - // CHECK: %[[BOUND:.*]] = arith.select %[[PRED]], %[[ZERO]], %[[I_Y]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[Y_MAX]], %[[I_Y]] - // CHECK: %[[YLO:.*]] = arith.select %[[PRED]], %[[Y_MAX]], %[[BOUND]] + // 
CHECK: %[[BOUND:.*]] = arith.maxsi %[[ZERO]], %[[I_Y]] + // CHECK: %[[YLO:.*]] = arith.minsi %[[Y_MAX]], %[[BOUND]] + + // CHECK: %[[BOUND:.*]] = arith.maxsi %[[ZERO]], %[[Y1]] + // CHECK: %[[YHI:.*]] = arith.minsi %[[Y_MAX]], %[[BOUND]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[Y1]], %[[ZERO]] - // CHECK: %[[BOUND:.*]] = arith.select %[[PRED]], %[[ZERO]], %[[Y1]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[Y_MAX]], %[[Y1]] - // CHECK: %[[YHI:.*]] = arith.select %[[PRED]], %[[Y_MAX]], %[[BOUND]] // CHECK: %[[YLOI:.+]] = arith.index_cast %[[YLO]] // CHECK: %[[YHII:.+]] = arith.index_cast %[[YHI]] // CHECK: %[[X1:.*]] = arith.addi %[[I_X]], %[[ONE]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[I_X]], %[[ZERO]] - // CHECK: %[[BOUND:.*]] = arith.select %[[PRED]], %[[ZERO]], %[[I_X]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[X_MAX]], %[[I_X]] - // CHECK: %[[XLO:.*]] = arith.select %[[PRED]], %[[X_MAX]], %[[BOUND]] - - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[X1]], %[[ZERO]] - // CHECK: %[[BOUND:.*]] = arith.select %[[PRED]], %[[ZERO]], %[[X1]] - // CHECK: %[[PRED:.*]] = arith.cmpi slt, %[[X_MAX]], %[[X1]] - // CHECK: %[[XHI:.*]] = arith.select %[[PRED]], %[[X_MAX]], %[[BOUND]] + // CHECK: %[[BOUND:.*]] = arith.maxsi %[[ZERO]], %[[I_X]] + // CHECK: %[[XLO:.*]] = arith.minsi %[[X_MAX]], %[[BOUND]] + + // CHECK: %[[BOUND:.*]] = arith.maxsi %[[ZERO]], %[[X1]] + // CHECK: %[[XHI:.*]] = arith.minsi %[[X_MAX]], %[[BOUND]] // CHECK: %[[XLOI:.+]] = arith.index_cast %[[XLO]] // CHECK: %[[XHII:.+]] = arith.index_cast %[[XHI]] diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index fc22a436526a6..febe74e876746 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -684,18 +684,16 @@ func.func @test_simple_i32(%arg0: tensor<1xi32>) -> () { %16 = tosa.select %14, %0, %1 : (tensor<1xi1>, tensor<1xi32>, tensor<1xi32>) -> 
tensor<1xi32> // CHECK: linalg.generic - // CHECK: arith.cmpi - // CHECK: select + // CHECK: arith.maxsi %17 = tosa.maximum %0, %1 : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: linalg.generic - // CHECK: arith.cmpi - // CHECK: select + // CHECK: arith.minsi %18 = tosa.minimum %0, %1 : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: linalg.generic - // CHECK: arith.cmpi - // CHECK: select + // CHECK-DAG: arith.maxsi + // CHECK-DAG: arith.minsi %19 = tosa.clamp %0 {min_int = 1 : i64, max_int = 5 : i64, min_fp = 1.0 : f32, max_fp = 5.0 : f32} : (tensor<1xi32>) -> tensor<1xi32> // CHECK: linalg.generic @@ -717,9 +715,8 @@ func.func @test_simple_i32(%arg0: tensor<1xi32>) -> () { // CHECK: linalg.generic // CHECK: arith.constant 0 - // CHECK: arith.cmpi sgt // CHECK: arith.subi - // CHECK: select + // CHECK: arith.maxsi %24 = tosa.abs %arg0 : (tensor<1xi32>) -> tensor<1xi32> return @@ -745,20 +742,16 @@ func.func @test_i8(%arg0: tensor<1xi8>) -> () { // CHECK: ^bb0(%[[ARG1:.+]]: i8, // CHECK-DAG: %[[C127:.+]] = arith.constant -127 // CHECK-DAG: %[[C126:.+]] = arith.constant 126 - // CHECK-DAG: %[[CMP1:.+]] = arith.cmpi slt, %[[ARG1]], %[[C127]] - // CHECK-DAG: %[[SEL1:.+]] = arith.select %[[CMP1]], %[[C127]] - // CHECK-DAG: %[[CMP2:.+]] = arith.cmpi slt, %[[C126]], %[[ARG1]] - // CHECK: %[[SEL2:.+]] = arith.select %[[CMP2]], %[[C126]], %[[SEL1]] + // CHECK-DAG: %[[LOWER:.+]] = arith.maxsi %[[C127]], %[[ARG1]] + // CHECK-DAG: %[[CLAMPED:.+]] = arith.minsi %[[C126]], %[[LOWER]] %0 = tosa.clamp %arg0 {min_int = -127 : i64, max_int = 126 : i64, min_fp = 0.0 : f32, max_fp = 0.0 : f32} : (tensor<1xi8>) -> tensor<1xi8> // CHECK: linalg.generic // CHECK: ^bb0(%[[ARG1:.+]]: i8, // CHECK-DAG: %[[C128:.+]] = arith.constant -128 // CHECK-DAG: %[[C127:.+]] = arith.constant 127 - // CHECK-DAG: %[[CMP1:.+]] = arith.cmpi slt, %[[ARG1]], %[[C128]] - // CHECK-DAG: %[[SEL1:.+]] = arith.select %[[CMP1]], %[[C128]] - // CHECK-DAG: %[[CMP2:.+]] = arith.cmpi slt, 
%[[C127]], %[[ARG1]] - // CHECK: %[[SEL2:.+]] = arith.select %[[CMP2]], %[[C127]], %[[SEL1]] + // CHECK-DAG: %[[LOWER:.+]] = arith.maxsi %[[C128]], %[[ARG1]] + // CHECK-DAG: %[[CLAMPED:.+]] = arith.minsi %[[C127]], %[[LOWER]] %1 = tosa.clamp %arg0 {min_int = -130 : i64, max_int = 130 : i64, min_fp = 0.0 : f32, max_fp = 0.0 : f32} : (tensor<1xi8>) -> tensor<1xi8> return @@ -814,10 +807,8 @@ func.func @test_negate_quantized(%arg0: tensor<1xi8>) -> () { // CHECK: [[SUB:%.+]] = arith.subi [[ZERO]], [[EXT]] // CHECK: [[MIN:%.+]] = arith.constant -128 // CHECK: [[MAX:%.+]] = arith.constant 127 - // CHECK: [[PRED1:%.+]] = arith.cmpi slt, [[SUB]], [[MIN]] - // CHECK: [[LBOUND:%.+]] = arith.select [[PRED1]], [[MIN]], [[SUB]] - // CHECK: [[PRED2:%.+]] = arith.cmpi slt, [[MAX]], [[SUB]] - // CHECK: [[UBOUND:%.+]] = arith.select [[PRED2]], [[MAX]], [[LBOUND]] + // CHECK: [[LBOUND:%.+]] = arith.maxsi [[MIN]], [[SUB]] + // CHECK: [[UBOUND:%.+]] = arith.minsi [[MAX]], [[LBOUND]] // CHECK: [[TRUNC:%.+]] = arith.trunci [[UBOUND]] // CHECK: linalg.yield [[TRUNC]] %0 = tosa.negate %arg0 {quantization_info = #tosa.unary_quant} : (tensor<1xi8>) -> tensor<1xi8> @@ -1009,15 +1000,13 @@ func.func @reduce_int(%arg0: tensor<5x4xi32>) -> () { // CHECK: arith.constant 2147483647 : i32 // CHECK: linalg.fill // CHECK: linalg.reduce - // CHECK: arith.cmpi slt - // CHECK: select + // CHECK: arith.minsi %3 = tosa.reduce_min %arg0 {axis = 0 : i32} : (tensor<5x4xi32>) -> tensor<1x4xi32> // CHECK: arith.constant -2147483648 : i32 // CHECK: linalg.fill // CHECK: linalg.reduce - // CHECK: arith.cmpi sgt - // CHECK: select + // CHECK: arith.maxsi %4 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<5x4xi32>) -> tensor<1x4xi32> return } @@ -1066,10 +1055,8 @@ func.func @rescale_i8(%arg0 : tensor<2xi8>) -> () { // CHECK-DAG: [[SCALED_ZEROED:%.+]] = arith.addi [[SCALED]], [[C22]] // CHECK-DAG: [[CMIN:%.+]] = arith.constant -128 // CHECK-DAG: [[CMAX:%.+]] = arith.constant 127 - // CHECK-DAG: [[MINLT:%.+]] 
= arith.cmpi slt, [[SCALED_ZEROED]], [[CMIN]] - // CHECK-DAG: [[MAXLT:%.+]] = arith.cmpi slt, [[CMAX]], [[SCALED_ZEROED]] - // CHECK-DAG: [[LOWER:%.+]] = arith.select [[MINLT]], [[CMIN]], [[SCALED_ZEROED]] - // CHECK-DAG: [[BOUNDED:%.+]] = arith.select [[MAXLT]], [[CMAX]], [[LOWER]] + // CHECK-DAG: [[LOWER:%.+]] = arith.maxsi [[CMIN]], [[SCALED_ZEROED]] + // CHECK-DAG: [[BOUNDED:%.+]] = arith.minsi [[CMAX]], [[LOWER]] // CHECK-DAG: [[TRUNC:%.+]] = arith.trunci [[BOUNDED]] // CHECK-DAG: linalg.yield [[TRUNC]] %0 = tosa.rescale %arg0 {input_zp = 17 : i32, output_zp = 22 : i32, multiplier = array, shift = array, scale32 = false, double_round = false, per_channel = false} : (tensor<2xi8>) -> tensor<2xi8> @@ -1087,10 +1074,8 @@ func.func @rescale_i8(%arg0 : tensor<2xi8>) -> () { // CHECK-DAG: [[SCALED_ZEROED:%.+]] = arith.addi [[SCALED]], [[C22]] // CHECK-DAG: [[CMIN:%.+]] = arith.constant 0 // CHECK-DAG: [[CMAX:%.+]] = arith.constant 255 - // CHECK-DAG: [[MINLT:%.+]] = arith.cmpi slt, [[SCALED_ZEROED]], [[CMIN]] - // CHECK-DAG: [[LOWER:%.+]] = arith.select [[MINLT]], [[CMIN]], [[SCALED_ZEROED]] - // CHECK-DAG: [[MAXLT:%.+]] = arith.cmpi slt, [[CMAX]], [[SCALED_ZEROED]] - // CHECK-DAG: [[BOUNDED:%.+]] = arith.select [[MAXLT]], [[CMAX]], [[LOWER]] + // CHECK-DAG: [[LOWER:%.+]] = arith.maxsi [[CMIN]], [[SCALED_ZEROED]] + // CHECK-DAG: [[BOUNDED:%.+]] = arith.minsi [[CMAX]], [[LOWER]] // CHECK-DAG: [[TRUNC:%.+]] = arith.trunci [[BOUNDED]] // CHECK-DAG: [[CAST:%.+]] = builtin.unrealized_conversion_cast [[TRUNC]] : i8 to ui8 // CHECK: linalg.yield [[CAST]] @@ -1160,10 +1145,8 @@ func.func @rescale_ui8(%arg0 : tensor<2xui8>) -> () { // CHECK-DAG: [[SCALED_ZEROED:%.+]] = arith.addi [[SCALED]], [[C22]] // CHECK-DAG: [[CMIN:%.+]] = arith.constant -128 // CHECK-DAG: [[CMAX:%.+]] = arith.constant 127 - // CHECK-DAG: [[MINLT:%.+]] = arith.cmpi slt, [[SCALED_ZEROED]], [[CMIN]] - // CHECK-DAG: [[LOWER:%.+]] = arith.select [[MINLT]], [[CMIN]], [[SCALED_ZEROED]] - // CHECK-DAG: 
[[MAXLT:%.+]] = arith.cmpi slt, [[CMAX]], [[SCALED_ZEROED]] - // CHECK-DAG: [[BOUNDED:%.+]] = arith.select [[MAXLT]], [[CMAX]], [[LOWER]] + // CHECK-DAG: [[LOWER:%.+]] = arith.maxsi [[CMIN]], [[SCALED_ZEROED]] + // CHECK-DAG: [[BOUNDED:%.+]] = arith.minsi [[CMAX]], [[LOWER]] // CHECK-DAG: [[TRUNC:%.+]] = arith.trunci [[BOUNDED]] // CHECK: linalg.yield [[TRUNC]] %0 = tosa.rescale %arg0 {input_zp = 17 : i32, output_zp = 22 : i32, multiplier = array, shift = array, scale32 = false, double_round = false, per_channel = false} : (tensor<2xui8>) -> tensor<2xi8> @@ -1192,10 +1175,8 @@ func.func @rescale_per_channel(%arg0 : tensor<3xi8>) -> (tensor<3xi8>) { // CHECK-DAG: [[SCALED_ZEROED:%.+]] = arith.addi [[SCALED]], [[C252]] // CHECK-DAG: [[CMIN:%.+]] = arith.constant -128 // CHECK-DAG: [[CMAX:%.+]] = arith.constant 127 - // CHECK-DAG: [[MINLT:%.+]] = arith.cmpi slt, [[SCALED_ZEROED]], [[CMIN]] - // CHECK-DAG: [[MAXLT:%.+]] = arith.cmpi slt, [[CMAX]], [[SCALED_ZEROED]] - // CHECK-DAG: [[LOWER:%.+]] = arith.select [[MINLT]], [[CMIN]], [[SCALED_ZEROED]] - // CHECK-DAG: [[BOUNDED:%.+]] = arith.select [[MAXLT]], [[CMAX]], [[LOWER]] + // CHECK-DAG: [[LOWER:%.+]] = arith.maxsi [[CMIN]], [[SCALED_ZEROED]] + // CHECK-DAG: [[BOUNDED:%.+]] = arith.minsi [[CMAX]], [[LOWER]] // CHECK-DAG: [[TRUNC:%.+]] = arith.trunci [[BOUNDED]] // CHECK-DAG: linalg.yield [[TRUNC]] %0 = tosa.rescale %arg0 {input_zp = 243 : i32, output_zp = 252 : i32, multiplier = array, shift = array, scale32 = false, double_round = false, per_channel = false} : (tensor<3xi8>) -> tensor<3xi8> diff --git a/mlir/test/Transforms/parametric-tiling.mlir b/mlir/test/Transforms/parametric-tiling.mlir index e3be41e702ec4..f6cace5397def 100644 --- a/mlir/test/Transforms/parametric-tiling.mlir +++ b/mlir/test/Transforms/parametric-tiling.mlir @@ -40,12 +40,10 @@ func.func @rectangular(%arg0: memref) { scf.for %i = %c2 to %c44 step %c1 { // Upper bound for the inner loop min(%i + %step, %c44). 
// COMMON: %[[stepped:.*]] = arith.addi %[[i]], %[[step]] - // COMMON-NEXT: arith.cmpi slt, %c44, %[[stepped]] - // COMMON-NEXT: %[[ub:.*]] = arith.select {{.*}}, %c44, %[[stepped]] + // COMMON-NEXT: %[[ub:.*]] = arith.minsi %c44, %[[stepped]] // // TILE_74: %[[stepped2:.*]] = arith.addi %[[j]], %[[step2]] - // TILE_74-NEXT: arith.cmpi slt, %c44, %[[stepped2]] - // TILE_74-NEXT: %[[ub2:.*]] = arith.select {{.*}}, %c44, %[[stepped2]] + // TILE_74-NEXT: %[[ub2:.*]] = arith.minsi %c44, %[[stepped2]] // Created inner scf. // COMMON:scf.for %[[ii:.*]] = %[[i]] to %[[ub:.*]] step %c1 @@ -108,11 +106,9 @@ func.func @triangular(%arg0: memref) { scf.for %i = %c2 to %c44 step %c1 { // Upper bound for the inner loop min(%i + %step, %c44). // COMMON: %[[stepped:.*]] = arith.addi %[[i]], %[[step]] - // COMMON-NEXT: arith.cmpi slt, %c44, %[[stepped]] - // COMMON-NEXT: %[[ub:.*]] = arith.select {{.*}}, %c44, %[[stepped]] + // COMMON-NEXT: %[[ub:.*]] = arith.minsi %c44, %[[stepped]] // TILE_74: %[[stepped2:.*]] = arith.addi %[[j]], %[[step2]] - // TILE_74-NEXT: arith.cmpi slt, %[[i]], %[[stepped2]] - // TILE_74-NEXT: %[[ub2:.*]] = arith.select {{.*}}, %[[i]], %[[stepped2]] + // TILE_74-NEXT: %[[ub2:.*]] = arith.minsi %[[i]], %[[stepped2]] // // Created inner scf. // COMMON:scf.for %[[ii:.*]] = %[[i]] to %[[ub:.*]] step %c1 From cd160a6e98533fbc04a76d1b969db77b49668eb3 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 21 Feb 2024 20:36:16 +0000 Subject: [PATCH 149/351] [VPlan] Do not add call results with void type to State (NFC). With vector libraries, we may vectorize calls with void return types. Do not add those values to the state; they can never be accessed. 
--- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9ee0cb2bd6153..2d2f6acf913f1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -655,7 +655,8 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { if (isa(V)) V->copyFastMathFlags(&CI); - State.set(this, V, Part); + if (!V->getType()->isVoidTy()) + State.set(this, V, Part); State.addMetadata(V, &CI); } } From 99c457dc2ef395872d7448c85609f6cb73a7f89b Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Wed, 21 Feb 2024 12:59:56 -0800 Subject: [PATCH 150/351] Unbreak *tf builtins for hexfloat (#82208) This re-lands cc0065a7d082f0bd322a538cf62cfaef1c8f89f8 in a way that keeps existing targets working. --------- Original commit message: #68132 ended up removing __multc3 & __divtc3 from compiler-rt library builds that have QUAD_PRECISION but not TF_MODE due to missing int128 support. I added support for QUAD_PRECISION to use the native hex float long double representation. 
--------- Co-authored-by: Sean Perry --- compiler-rt/lib/builtins/divtc3.c | 2 +- compiler-rt/lib/builtins/fp_lib.h | 41 ++++++++++++++++++---------- compiler-rt/lib/builtins/int_types.h | 8 ++++-- compiler-rt/lib/builtins/multc3.c | 2 +- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/compiler-rt/lib/builtins/divtc3.c b/compiler-rt/lib/builtins/divtc3.c index e970cef574b21..099de5802daf0 100644 --- a/compiler-rt/lib/builtins/divtc3.c +++ b/compiler-rt/lib/builtins/divtc3.c @@ -13,7 +13,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) // Returns: the quotient of (a + ib) / (c + id) diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h index af406e760497a..c4f0a5b9587f7 100644 --- a/compiler-rt/lib/builtins/fp_lib.h +++ b/compiler-rt/lib/builtins/fp_lib.h @@ -22,6 +22,7 @@ #include "int_lib.h" #include "int_math.h" +#include "int_types.h" #include #include #include @@ -93,13 +94,14 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { COMPILER_RT_ABI fp_t __adddf3(fp_t a, fp_t b); #elif defined QUAD_PRECISION -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) typedef uint64_t half_rep_t; typedef __uint128_t rep_t; typedef __int128_t srep_t; typedef tf_float fp_t; #define HALF_REP_C UINT64_C #define REP_C (__uint128_t) +#if defined(CRT_HAS_IEEE_TF) // Note: Since there is no explicit way to tell compiler the constant is a // 128-bit integer, we let the constant be casted to 128-bit integer #define significandBits 112 @@ -188,7 +190,10 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { #undef Word_HiMask #undef Word_LoMask #undef Word_FullMask -#endif // defined(CRT_HAS_TF_MODE) +#endif // defined(CRT_HAS_IEEE_TF) +#else +typedef long double fp_t; +#endif // defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) #else #error SINGLE_PRECISION, DOUBLE_PRECISION or QUAD_PRECISION must be 
defined. #endif @@ -196,19 +201,6 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { #if defined(SINGLE_PRECISION) || defined(DOUBLE_PRECISION) || \ (defined(QUAD_PRECISION) && defined(CRT_HAS_TF_MODE)) #define typeWidth (sizeof(rep_t) * CHAR_BIT) -#define exponentBits (typeWidth - significandBits - 1) -#define maxExponent ((1 << exponentBits) - 1) -#define exponentBias (maxExponent >> 1) - -#define implicitBit (REP_C(1) << significandBits) -#define significandMask (implicitBit - 1U) -#define signBit (REP_C(1) << (significandBits + exponentBits)) -#define absMask (signBit - 1U) -#define exponentMask (absMask ^ significandMask) -#define oneRep ((rep_t)exponentBias << significandBits) -#define infRep exponentMask -#define quietBit (implicitBit >> 1) -#define qnanRep (exponentMask | quietBit) static __inline rep_t toRep(fp_t x) { const union { @@ -226,6 +218,21 @@ static __inline fp_t fromRep(rep_t x) { return rep.f; } +#if !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) +#define exponentBits (typeWidth - significandBits - 1) +#define maxExponent ((1 << exponentBits) - 1) +#define exponentBias (maxExponent >> 1) + +#define implicitBit (REP_C(1) << significandBits) +#define significandMask (implicitBit - 1U) +#define signBit (REP_C(1) << (significandBits + exponentBits)) +#define absMask (signBit - 1U) +#define exponentMask (absMask ^ significandMask) +#define oneRep ((rep_t)exponentBias << significandBits) +#define infRep exponentMask +#define quietBit (implicitBit >> 1) +#define qnanRep (exponentMask | quietBit) + static __inline int normalize(rep_t *significand) { const int shift = rep_clz(*significand) - rep_clz(implicitBit); *significand <<= shift; @@ -328,6 +335,8 @@ static __inline fp_t __compiler_rt_scalbnX(fp_t x, int y) { return fromRep(sign | ((rep_t)exp << significandBits) | sig); } +#endif // !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) + // Avoid using fmax from libm. 
static __inline fp_t __compiler_rt_fmaxX(fp_t x, fp_t y) { // If either argument is NaN, return the other argument. If both are NaN, @@ -405,6 +414,8 @@ static __inline tf_float __compiler_rt_fmaxtf(tf_float x, tf_float y) { #define __compiler_rt_logbl crt_logbl #define __compiler_rt_scalbnl crt_scalbnl #define __compiler_rt_fmaxl crt_fmaxl +#define crt_fabstf crt_fabsl +#define crt_copysigntf crt_copysignl #else #error Unsupported TF mode type #endif diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h index 7624c72806151..ca97391fc2846 100644 --- a/compiler-rt/lib/builtins/int_types.h +++ b/compiler-rt/lib/builtins/int_types.h @@ -189,12 +189,16 @@ typedef long double tf_float; #define CRT_LDBL_IEEE_F128 #endif #define TF_C(x) x##L -#elif __LDBL_MANT_DIG__ == 113 -// Use long double instead of __float128 if it matches the IEEE 128-bit format. +#elif __LDBL_MANT_DIG__ == 113 || \ + (__FLT_RADIX__ == 16 && __LDBL_MANT_DIG__ == 28) +// Use long double instead of __float128 if it matches the IEEE 128-bit format +// or the IBM hexadecimal format. 
#define CRT_LDBL_128BIT #define CRT_HAS_F128 +#if __LDBL_MANT_DIG__ == 113 #define CRT_HAS_IEEE_TF #define CRT_LDBL_IEEE_F128 +#endif typedef long double tf_float; #define TF_C(x) x##L #elif defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__) diff --git a/compiler-rt/lib/builtins/multc3.c b/compiler-rt/lib/builtins/multc3.c index f20e53ccbf233..61a3f45e47279 100644 --- a/compiler-rt/lib/builtins/multc3.c +++ b/compiler-rt/lib/builtins/multc3.c @@ -15,7 +15,7 @@ #include "int_lib.h" #include "int_math.h" -#if defined(CRT_HAS_TF_MODE) +#if defined(CRT_HAS_F128) // Returns: the product of a + ib and c + id From 81b4b89197a6be5f19f907b558540bb3cb70f064 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 21 Feb 2024 13:00:08 -0800 Subject: [PATCH 151/351] [Sanitizer] Support -fwrapv with -fsanitize=signed-integer-overflow (#82432) Clang has a `signed-integer-overflow` sanitizer to catch arithmetic overflow; however, most of its instrumentation [fails to apply](https://godbolt.org/z/ee41rE8o6) when `-fwrapv` is enabled; this is by design. The Linux kernel enables `-fno-strict-overflow` which implies `-fwrapv`. This means we are [currently unable to detect signed-integer wrap-around](https://github.com/KSPP/linux/issues/26). All the while, the root cause of many security vulnerabilities in the Linux kernel is [arithmetic overflow](https://cwe.mitre.org/data/definitions/190.html). To work around this and enhance the functionality of `-fsanitize=signed-integer-overflow`, we instrument signed arithmetic even if the signed overflow behavior is defined. 
Co-authored-by: Justin Stitt --- clang/docs/ReleaseNotes.rst | 8 ++++++++ clang/docs/UndefinedBehaviorSanitizer.rst | 9 +++++---- clang/lib/CodeGen/CGExprScalar.cpp | 16 ++++++++++++---- clang/test/CodeGen/integer-overflow.c | 6 ++++-- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index dd217e16f1f1a..ef2d9b8e46ae4 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -408,6 +408,14 @@ Moved checkers Sanitizers ---------- +- ``-fsanitize=signed-integer-overflow`` now instruments signed arithmetic even + when ``-fwrapv`` is enabled. Previously, only division checks were enabled. + + Users with ``-fwrapv`` as well as a sanitizer group like + ``-fsanitize=undefined`` or ``-fsanitize=integer`` enabled may want to + manually disable potentially noisy signed integer overflow checks with + ``-fno-sanitize=signed-integer-overflow`` + Python Binding Changes ---------------------- diff --git a/clang/docs/UndefinedBehaviorSanitizer.rst b/clang/docs/UndefinedBehaviorSanitizer.rst index b8ad3804f1890..8f58c92bd2a16 100644 --- a/clang/docs/UndefinedBehaviorSanitizer.rst +++ b/clang/docs/UndefinedBehaviorSanitizer.rst @@ -190,10 +190,11 @@ Available checks are: - ``-fsanitize=signed-integer-overflow``: Signed integer overflow, where the result of a signed integer computation cannot be represented in its type. This includes all the checks covered by ``-ftrapv``, as well as checks for - signed division overflow (``INT_MIN/-1``), but not checks for - lossy implicit conversions performed before the computation - (see ``-fsanitize=implicit-conversion``). Both of these two issues are - handled by ``-fsanitize=implicit-conversion`` group of checks. + signed division overflow (``INT_MIN/-1``). Note that checks are still + added even when ``-fwrapv`` is enabled. 
This sanitizer does not check for + lossy implicit conversions performed before the computation (see + ``-fsanitize=implicit-conversion``). Both of these two issues are handled + by ``-fsanitize=implicit-conversion`` group of checks. - ``-fsanitize=unreachable``: If control flow reaches an unreachable program point. - ``-fsanitize=unsigned-integer-overflow``: Unsigned integer overflow, where diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 576734e460b9c..10b7457522044 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -723,7 +723,9 @@ class ScalarExprEmitter if (Ops.Ty->isSignedIntegerOrEnumerationType()) { switch (CGF.getLangOpts().getSignedOverflowBehavior()) { case LangOptions::SOB_Defined: - return Builder.CreateMul(Ops.LHS, Ops.RHS, "mul"); + if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) + return Builder.CreateMul(Ops.LHS, Ops.RHS, "mul"); + [[fallthrough]]; case LangOptions::SOB_Undefined: if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) return Builder.CreateNSWMul(Ops.LHS, Ops.RHS, "mul"); @@ -2568,7 +2570,9 @@ llvm::Value *ScalarExprEmitter::EmitIncDecConsiderOverflowBehavior( StringRef Name = IsInc ? 
"inc" : "dec"; switch (CGF.getLangOpts().getSignedOverflowBehavior()) { case LangOptions::SOB_Defined: - return Builder.CreateAdd(InVal, Amount, Name); + if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) + return Builder.CreateAdd(InVal, Amount, Name); + [[fallthrough]]; case LangOptions::SOB_Undefined: if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) return Builder.CreateNSWAdd(InVal, Amount, Name); @@ -3913,7 +3917,9 @@ Value *ScalarExprEmitter::EmitAdd(const BinOpInfo &op) { if (op.Ty->isSignedIntegerOrEnumerationType()) { switch (CGF.getLangOpts().getSignedOverflowBehavior()) { case LangOptions::SOB_Defined: - return Builder.CreateAdd(op.LHS, op.RHS, "add"); + if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) + return Builder.CreateAdd(op.LHS, op.RHS, "add"); + [[fallthrough]]; case LangOptions::SOB_Undefined: if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) return Builder.CreateNSWAdd(op.LHS, op.RHS, "add"); @@ -4067,7 +4073,9 @@ Value *ScalarExprEmitter::EmitSub(const BinOpInfo &op) { if (op.Ty->isSignedIntegerOrEnumerationType()) { switch (CGF.getLangOpts().getSignedOverflowBehavior()) { case LangOptions::SOB_Defined: - return Builder.CreateSub(op.LHS, op.RHS, "sub"); + if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) + return Builder.CreateSub(op.LHS, op.RHS, "sub"); + [[fallthrough]]; case LangOptions::SOB_Undefined: if (!CGF.SanOpts.has(SanitizerKind::SignedIntegerOverflow)) return Builder.CreateNSWSub(op.LHS, op.RHS, "sub"); diff --git a/clang/test/CodeGen/integer-overflow.c b/clang/test/CodeGen/integer-overflow.c index 9a3107c0b5292..461b026d39615 100644 --- a/clang/test/CodeGen/integer-overflow.c +++ b/clang/test/CodeGen/integer-overflow.c @@ -1,7 +1,8 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -o - | FileCheck %s --check-prefix=DEFAULT // RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -o - -fwrapv | FileCheck %s --check-prefix=WRAPV // RUN: %clang_cc1 -triple 
x86_64-apple-darwin %s -emit-llvm -o - -ftrapv | FileCheck %s --check-prefix=TRAPV -// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -o - -fsanitize=signed-integer-overflow | FileCheck %s --check-prefix=CATCH_UB +// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -o - -fsanitize=signed-integer-overflow | FileCheck %s --check-prefixes=CATCH_UB,CATCH_UB_POINTER +// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -o - -fsanitize=signed-integer-overflow -fwrapv | FileCheck %s --check-prefixes=CATCH_UB,NOCATCH_UB_POINTER // RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -o - -ftrapv -ftrapv-handler foo | FileCheck %s --check-prefix=TRAPV_HANDLER @@ -62,7 +63,8 @@ void test1(void) { // DEFAULT: getelementptr inbounds i32, ptr // WRAPV: getelementptr i32, ptr // TRAPV: getelementptr inbounds i32, ptr - // CATCH_UB: getelementptr inbounds i32, ptr + // CATCH_UB_POINTER: getelementptr inbounds i32, ptr + // NOCATCH_UB_POINTER: getelementptr i32, ptr // PR9350: char pre-increment never overflows. 
extern volatile signed char PR9350_char_inc; From c63e68ba5fb54b69521c4f010d1c5290856c6509 Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Wed, 21 Feb 2024 13:42:10 -0800 Subject: [PATCH 152/351] Bump the minimum LLVM version for TestTypeList.py --- lldb/test/API/python_api/type/TestTypeList.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/python_api/type/TestTypeList.py b/lldb/test/API/python_api/type/TestTypeList.py index e75affd652211..eba5e17355c3f 100644 --- a/lldb/test/API/python_api/type/TestTypeList.py +++ b/lldb/test/API/python_api/type/TestTypeList.py @@ -18,6 +18,7 @@ def setUp(self): self.source = "main.cpp" self.line = line_number(self.source, "// Break at this line") + @skipIf(compiler="clang", compiler_version=["<", "17.0"]) def test(self): """Exercise SBType and SBTypeList API.""" d = {"EXE": self.exe_name} From 2b2881b0ae94e56aa019b519419d122bb7b81462 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 21 Feb 2024 14:16:27 -0800 Subject: [PATCH 153/351] Add namespace qualifier for llvm::StringRef --- clang/include/clang/InstallAPI/Context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/InstallAPI/Context.h b/clang/include/clang/InstallAPI/Context.h index b06168918a613..7d105920734fd 100644 --- a/clang/include/clang/InstallAPI/Context.h +++ b/clang/include/clang/InstallAPI/Context.h @@ -28,7 +28,7 @@ struct InstallAPIContext { llvm::Triple TargetTriple{}; /// File Path of output location. - StringRef OutputLoc{}; + llvm::StringRef OutputLoc{}; /// What encoding to write output as. llvm::MachO::FileType FT = llvm::MachO::FileType::TBD_V5; From 9eff001d3dbe84851caa7de4e1093af62c009e06 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Wed, 21 Feb 2024 22:14:04 +0000 Subject: [PATCH 154/351] [TargetLowering] Correctly yield NaN from FP_TO_BF16 We didn't set the exponent field, resulting in tiny numbers instead of NaNs. 
--- .../CodeGen/SelectionDAG/TargetLowering.cpp | 9 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 1864 ++++++++--------- .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 4 +- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 20 +- .../isel-amdgpu-cs-chain-preserve-cc.ll | 8 +- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 8 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 20 +- 7 files changed, 966 insertions(+), 967 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index bde1fff4e1ca7..a4c5167ade376 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10948,12 +10948,11 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const { Op = expandRoundInexactToOdd(F32, Op, dl, DAG); Op = DAG.getNode(ISD::BITCAST, dl, I32, Op); - // Extract the sign bit. - SDValue SignBit = - DAG.getNode(ISD::AND, dl, I32, Op, - DAG.getConstant(APInt::getSignMask(32), dl, I32)); + // Extract the sign bit and exponent. + SDValue SignBitAndExponentField = DAG.getNode( + ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32)); // Set the quiet bit. - SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBit, + SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField, DAG.getConstant(0x400000, dl, I32)); // Factor in the contribution of the low 16 bits. 
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 67538f26c550b..63a09e49e0051 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2182,7 +2182,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -2199,7 +2199,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -2212,7 +2212,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 @@ -2226,7 +2226,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 @@ -2294,7 +2294,7 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; 
GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2323,7 +2323,7 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4 ; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v5 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off @@ -2343,14 +2343,14 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5 ; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0x400000 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] ; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 ; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v5, s4 +; GFX10-NEXT: v_and_or_b32 v5, v5, s4, 0x400000 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2371,7 +2371,7 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0x400000 +; GFX11-NEXT: 
s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo @@ -2380,7 +2380,7 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff -; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v5, s0 +; GFX11-NEXT: v_and_or_b32 v5, v5, s0, 0x400000 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8999,7 +8999,7 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -9014,7 +9014,7 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -9027,7 +9027,7 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -9042,7 +9042,7 @@ define bfloat 
@v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -9104,7 +9104,7 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -9112,7 +9112,7 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc @@ -9128,7 +9128,7 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 @@ -9137,7 +9137,7 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 
0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -9153,7 +9153,7 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 @@ -9176,7 +9176,7 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_add_f32_e32 v2, v3, v2 @@ -9255,7 +9255,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -9268,7 +9268,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -9276,7 +9276,7 @@ define <3 x bfloat> 
@v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -9293,7 +9293,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -9302,7 +9302,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 @@ -9311,7 +9311,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -9331,7 +9331,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; 
GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 @@ -9426,7 +9426,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -9435,7 +9435,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -9447,7 +9447,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -9455,7 +9455,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc @@ -9473,7 +9473,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> 
%b) { ; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 @@ -9482,7 +9482,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -9491,7 +9491,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 @@ -9500,7 +9500,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -9525,7 +9525,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 
0x400000 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 @@ -9560,7 +9560,7 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 @@ -9717,7 +9717,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -9726,7 +9726,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc @@ -9738,7 +9738,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -9746,7 +9746,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; 
GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc @@ -9758,7 +9758,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -9766,7 +9766,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc @@ -9778,7 +9778,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -9786,7 +9786,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX8-NEXT: 
v_and_b32_e32 v9, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc @@ -9808,7 +9808,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 @@ -9817,7 +9817,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -9826,7 +9826,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 @@ -9835,7 +9835,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -9844,7 +9844,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; 
GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 @@ -9853,7 +9853,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -9862,7 +9862,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 @@ -9871,7 +9871,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -9896,7 +9896,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 ; GFX10-NEXT: 
v_and_or_b32 v7, v8, s4, 0x400000 ; GFX10-NEXT: v_add_f32_e32 v9, v11, v9 @@ -9967,7 +9967,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 ; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 @@ -10263,7 +10263,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v15 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 @@ -10271,7 +10271,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc @@ -10283,7 +10283,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v15 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v14 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 @@ -10291,7 +10291,7 @@ define 
<16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc @@ -10303,7 +10303,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v14 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v13 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 @@ -10311,7 +10311,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc @@ -10323,7 +10323,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v13 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 @@ -10331,7 +10331,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX8-NEXT: 
v_add_u32_e32 v12, vcc, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc @@ -10343,7 +10343,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 @@ -10351,7 +10351,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc @@ -10363,7 +10363,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v11 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v10 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 @@ -10371,7 +10371,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; 
GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc @@ -10383,7 +10383,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v10 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 @@ -10391,7 +10391,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc @@ -10403,7 +10403,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v9 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 @@ -10411,7 +10411,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: 
v_cndmask_b32_e32 v0, v8, v17, vcc @@ -10441,7 +10441,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v16 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 @@ -10450,7 +10450,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -10459,7 +10459,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v15 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 @@ -10468,7 +10468,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -10477,7 +10477,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: 
v_add_f32_e32 v14, v17, v14 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 @@ -10486,7 +10486,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -10495,7 +10495,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v13 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 @@ -10504,7 +10504,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -10513,7 +10513,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v12 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX9-NEXT: 
v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 @@ -10522,7 +10522,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -10531,7 +10531,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v11 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 @@ -10540,7 +10540,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -10549,7 +10549,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v10 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 @@ -10558,7 +10558,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x 
bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -10567,7 +10567,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v9 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 @@ -10576,7 +10576,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -10599,7 +10599,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX10-NEXT: v_add_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 @@ -10742,7 +10742,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: 
s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -11434,7 +11434,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX8-NEXT: v_add_f32_e32 v14, v14, v30 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 @@ -11442,7 +11442,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc @@ -11465,14 +11465,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_add_f32_e32 v30, v15, v30 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v30 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, 
v30, v30 @@ -11480,13 +11480,13 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc @@ -11498,7 +11498,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v29 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29 ; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 @@ -11506,7 +11506,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc @@ -11518,7 +11518,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: 
v_and_b32_e32 v34, 0x80000000, v28 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 @@ -11526,7 +11526,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc @@ -11538,7 +11538,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v27 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 @@ -11546,7 +11546,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc @@ -11558,7 +11558,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v26 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX8-NEXT: v_add_f32_e32 v9, 
v9, v25 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 @@ -11566,7 +11566,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc @@ -11578,7 +11578,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 @@ -11586,7 +11586,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc @@ -11598,7 +11598,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v24 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 @@ -11606,7 +11606,7 @@ define <32 x 
bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc @@ -11618,7 +11618,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v23 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 @@ -11626,7 +11626,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc @@ -11638,7 +11638,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v22 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 @@ -11646,7 +11646,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX8-NEXT: 
v_add_u32_e32 v21, vcc, v21, v5 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc @@ -11658,7 +11658,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v21 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 @@ -11666,7 +11666,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc @@ -11678,7 +11678,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v20 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 @@ -11686,7 +11686,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; 
GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc @@ -11698,7 +11698,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v19 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 @@ -11706,7 +11706,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc @@ -11718,7 +11718,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v18 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 @@ -11726,7 +11726,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc @@ -11738,7 +11738,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v17 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -11746,7 +11746,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc @@ -11790,7 +11790,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 @@ -11799,7 +11799,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 @@ -11812,7 +11812,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: 
buffer_load_dword v29, off, s[0:3], s32 ; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 @@ -11825,19 +11825,19 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_add_f32_e32 v29, v15, v29 ; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32 ; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc ; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29 ; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13 ; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 @@ -11846,7 +11846,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_add_f32_e32 v32, v33, v32 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 @@ -11855,7 +11855,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; 
GFX9-NEXT: v_add_f32_e32 v12, v12, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 @@ -11864,7 +11864,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_add_f32_e32 v28, v33, v28 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v28 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 @@ -11873,7 +11873,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 @@ -11882,7 +11882,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_add_f32_e32 v27, v33, v27 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v27 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 @@ -11891,7 +11891,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 
0x80000000, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 @@ -11900,7 +11900,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v26, v33, v26 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v26 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 @@ -11909,7 +11909,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 @@ -11918,7 +11918,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v25, v33, v25 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 @@ -11927,7 +11927,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v8, v8 @@ -11936,7 +11936,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v24 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 @@ -11945,7 +11945,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -11954,7 +11954,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v23 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 @@ -11963,7 +11963,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -11972,7 +11972,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 
; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v22 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 @@ -11981,7 +11981,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -11990,7 +11990,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v21 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 @@ -11999,7 +11999,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -12008,7 +12008,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_add_f32_e32 v20, v33, v20 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v20 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; 
GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 @@ -12017,7 +12017,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -12026,7 +12026,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v19 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 @@ -12035,7 +12035,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -12044,7 +12044,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v18 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 @@ -12053,7 +12053,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; 
GFX9-NEXT: v_add_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -12062,7 +12062,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v17 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 @@ -12071,7 +12071,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -12163,7 +12163,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: s_mov_b32 s23, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 @@ -12385,7 +12385,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -12678,7 +12678,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -12692,7 +12692,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -12704,7 +12704,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -12718,7 +12718,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -12761,7 +12761,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 
0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -12775,7 +12775,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -12787,7 +12787,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -12801,7 +12801,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -12849,7 +12849,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -12864,7 +12864,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: 
v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -12877,7 +12877,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -12892,7 +12892,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -12954,7 +12954,7 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -12962,7 +12962,7 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc @@ -12978,7 +12978,7 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 @@ -12987,7 +12987,7 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -13003,7 +13003,7 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 @@ -13026,7 +13026,7 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2 @@ -13105,7 +13105,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, 
vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -13118,7 +13118,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -13126,7 +13126,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -13143,7 +13143,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -13152,7 +13152,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 @@ -13161,7 +13161,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -13181,7 +13181,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 @@ -13276,7 +13276,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -13285,7 +13285,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -13297,7 +13297,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -13305,7 +13305,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc @@ -13323,7 +13323,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 @@ -13332,7 +13332,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -13341,7 +13341,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v5, v5, 
v3, s4 @@ -13350,7 +13350,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -13375,7 +13375,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 @@ -13410,7 +13410,7 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 @@ -13481,7 +13481,7 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -13496,7 +13496,7 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 
0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -13509,7 +13509,7 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -13524,7 +13524,7 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -13586,7 +13586,7 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -13594,7 +13594,7 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc @@ -13610,7 +13610,7 @@ define <2 x bfloat> 
@v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 @@ -13619,7 +13619,7 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -13635,7 +13635,7 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 @@ -13658,7 +13658,7 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 @@ -13737,7 +13737,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: 
v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -13750,7 +13750,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -13758,7 +13758,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -13775,7 +13775,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -13784,7 +13784,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 @@ -13793,7 +13793,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> 
%a, <3 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -13813,7 +13813,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 @@ -13908,7 +13908,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -13917,7 +13917,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -13929,7 +13929,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; 
GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -13937,7 +13937,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc @@ -13955,7 +13955,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 @@ -13964,7 +13964,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -13973,7 +13973,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 @@ -13982,7 +13982,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 
v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -14007,7 +14007,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 @@ -14042,7 +14042,7 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 @@ -14199,7 +14199,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -14208,7 +14208,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, 
v3 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc @@ -14220,7 +14220,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -14228,7 +14228,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc @@ -14240,7 +14240,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -14248,7 +14248,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc @@ -14260,7 +14260,7 @@ define <8 x bfloat> 
@v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -14268,7 +14268,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc @@ -14290,7 +14290,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 @@ -14299,7 +14299,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -14308,7 +14308,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v7 +; 
GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 @@ -14317,7 +14317,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -14326,7 +14326,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 @@ -14335,7 +14335,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -14344,7 +14344,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 @@ -14353,7 +14353,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, 
<8 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -14378,7 +14378,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 ; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 ; GFX10-NEXT: v_mul_f32_e32 v9, v11, v9 @@ -14449,7 +14449,7 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 ; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 @@ -14745,7 +14745,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 @@ -14753,7 +14753,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: 
v_and_b32_e32 v17, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc @@ -14765,7 +14765,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v15 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 @@ -14773,7 +14773,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc @@ -14785,7 +14785,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v14 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 @@ -14793,7 +14793,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc @@ -14805,7 +14805,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v13 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 @@ -14813,7 +14813,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc @@ -14825,7 +14825,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 @@ -14833,7 +14833,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc @@ -14845,7 +14845,7 @@ define <16 x bfloat> 
@v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v11 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 @@ -14853,7 +14853,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc @@ -14865,7 +14865,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v10 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 @@ -14873,7 +14873,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc @@ -14885,7 +14885,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v9 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 @@ -14893,7 +14893,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc @@ -14923,7 +14923,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v16 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 @@ -14932,7 +14932,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -14941,7 +14941,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v15 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 
0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 @@ -14950,7 +14950,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -14959,7 +14959,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 @@ -14968,7 +14968,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -14977,7 +14977,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v13 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 @@ -14986,7 +14986,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x 
bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -14995,7 +14995,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v12 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 @@ -15004,7 +15004,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -15013,7 +15013,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v11 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 @@ -15022,7 +15022,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v2 
+; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -15031,7 +15031,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v10 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 @@ -15040,7 +15040,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -15049,7 +15049,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v9 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 @@ -15058,7 +15058,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -15081,7 +15081,7 @@ define 
<16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 @@ -15224,7 +15224,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -15916,7 +15916,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 @@ -15924,7 +15924,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc @@ -15947,14 +15947,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: 
v_mul_f32_e32 v30, v15, v30 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v30 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 @@ -15962,13 +15962,13 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc @@ -15980,7 +15980,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v29 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29 ; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 @@ -15988,7 +15988,7 @@ 
define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc @@ -16000,7 +16000,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v28 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 @@ -16008,7 +16008,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc @@ -16020,7 +16020,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v27 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 @@ -16028,7 +16028,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v27, 
v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc @@ -16040,7 +16040,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v26 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 @@ -16048,7 +16048,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc @@ -16060,7 +16060,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 @@ -16068,7 +16068,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: 
v_and_b32_e32 v33, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc @@ -16080,7 +16080,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v24 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 @@ -16088,7 +16088,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc @@ -16100,7 +16100,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v23 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 @@ -16108,7 +16108,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc @@ -16120,7 +16120,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v22 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 @@ -16128,7 +16128,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc @@ -16140,7 +16140,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v21 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 @@ -16148,7 +16148,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc @@ -16160,7 +16160,7 @@ define <32 x bfloat> 
@v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v20 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 @@ -16168,7 +16168,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc @@ -16180,7 +16180,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v19 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 @@ -16188,7 +16188,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc @@ -16200,7 +16200,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v18 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 @@ -16208,7 +16208,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc @@ -16220,7 +16220,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v17 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -16228,7 +16228,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc @@ -16272,7 +16272,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31 
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 @@ -16281,7 +16281,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 @@ -16294,7 +16294,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 @@ -16307,19 +16307,19 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29 ; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32 ; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc ; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29 ; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 
0xff800000, v13 ; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 @@ -16328,7 +16328,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 @@ -16337,7 +16337,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 @@ -16346,7 +16346,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v28 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 @@ -16355,7 +16355,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 @@ -16364,7 +16364,7 
@@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v27 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 @@ -16373,7 +16373,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 @@ -16382,7 +16382,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v26 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 @@ -16391,7 +16391,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 @@ -16400,7 +16400,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25 ; GFX9-NEXT: 
v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 @@ -16409,7 +16409,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -16418,7 +16418,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v24 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 @@ -16427,7 +16427,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -16436,7 +16436,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v23 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 @@ -16445,7 +16445,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -16454,7 +16454,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v22 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 @@ -16463,7 +16463,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -16472,7 +16472,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v21 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 @@ -16481,7 +16481,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: 
v_mul_f32_e32 v4, v4, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -16490,7 +16490,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v20 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 @@ -16499,7 +16499,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -16508,7 +16508,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v19 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 @@ -16517,7 +16517,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 
v33, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -16526,7 +16526,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v18 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 @@ -16535,7 +16535,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -16544,7 +16544,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v17 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 @@ -16553,7 +16553,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -16645,7 +16645,7 @@ define <32 x 
bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: s_mov_b32 s23, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 @@ -16867,7 +16867,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -17194,7 +17194,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -17220,7 +17220,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -17235,7 +17235,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 
0xff800000 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -17258,7 +17258,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 @@ -17637,7 +17637,7 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -17652,7 +17652,7 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -17665,7 +17665,7 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -17680,7 +17680,7 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -17750,7 +17750,7 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -17758,7 +17758,7 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc @@ -17774,7 +17774,7 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 @@ -17783,7 +17783,7 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: 
v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -17799,7 +17799,7 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 @@ -17822,7 +17822,7 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v2, v3, v2 @@ -17913,7 +17913,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -17926,7 +17926,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -17934,7 +17934,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, 
vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -17951,7 +17951,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -17960,7 +17960,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 @@ -17969,7 +17969,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -17989,7 +17989,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 ; 
GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 @@ -18100,7 +18100,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -18109,7 +18109,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -18121,7 +18121,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -18129,7 +18129,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc @@ -18147,7 +18147,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 ; GFX9-NEXT: 
v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 @@ -18156,7 +18156,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -18165,7 +18165,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 @@ -18174,7 +18174,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -18199,7 +18199,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 ; GFX10-NEXT: v_bfe_u32 v7, v5, 
16, 1 @@ -18234,7 +18234,7 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 @@ -18423,7 +18423,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -18432,7 +18432,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc @@ -18444,7 +18444,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -18452,7 +18452,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: 
v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc @@ -18464,7 +18464,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -18472,7 +18472,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc @@ -18484,7 +18484,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -18492,7 +18492,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; 
GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc @@ -18514,7 +18514,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 @@ -18523,7 +18523,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -18532,7 +18532,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 @@ -18541,7 +18541,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -18550,7 +18550,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> 
%a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 @@ -18559,7 +18559,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -18568,7 +18568,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v5, v9, v5 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 @@ -18577,7 +18577,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -18602,7 +18602,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: 
v_bfe_u32 v10, v8, 16, 1 ; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 ; GFX10-NEXT: v_min_f32_e32 v9, v11, v9 @@ -18673,7 +18673,7 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 ; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 @@ -19033,7 +19033,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 @@ -19041,7 +19041,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc @@ -19053,7 +19053,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v15 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v14 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v15, v15 @@ -19061,7 +19061,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc @@ -19073,7 +19073,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v14 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v13 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 @@ -19081,7 +19081,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc @@ -19093,7 +19093,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v13 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v12 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 @@ -19101,7 +19101,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> 
%b) { ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc @@ -19113,7 +19113,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v11 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 @@ -19121,7 +19121,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc @@ -19133,7 +19133,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v11 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v10 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 @@ -19141,7 +19141,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 ; GFX8-NEXT: 
v_add_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc @@ -19153,7 +19153,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v10 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 @@ -19161,7 +19161,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc @@ -19173,7 +19173,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v9 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 @@ -19181,7 +19181,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0 ; GFX8-NEXT: 
v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc @@ -19211,7 +19211,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v16 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 @@ -19220,7 +19220,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -19229,7 +19229,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v15 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 @@ -19238,7 +19238,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -19247,7 +19247,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x 
bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 @@ -19256,7 +19256,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -19265,7 +19265,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v13 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 @@ -19274,7 +19274,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -19283,7 +19283,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: 
v_and_b32_e32 v18, 0x80000000, v12 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 @@ -19292,7 +19292,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -19301,7 +19301,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v11 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 @@ -19310,7 +19310,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -19319,7 +19319,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v10 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; 
GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 @@ -19328,7 +19328,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -19337,7 +19337,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v9, v17, v9 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v9 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 @@ -19346,7 +19346,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -19369,7 +19369,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX10-NEXT: v_min_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 @@ -19512,7 +19512,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v17, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -20332,7 +20332,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX8-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 @@ -20340,7 +20340,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc @@ -20363,14 +20363,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_min_f32_e32 v30, v15, v30 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v30 +; GFX8-NEXT: v_and_b32_e32 v34, 
0xff800000, v30 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 @@ -20378,13 +20378,13 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc @@ -20396,7 +20396,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v29 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29 ; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 @@ -20404,7 +20404,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc @@ -20416,7 +20416,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; 
GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v28 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 @@ -20424,7 +20424,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc @@ -20436,7 +20436,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v27 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 @@ -20444,7 +20444,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc @@ -20456,7 +20456,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: 
v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v26 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 @@ -20464,7 +20464,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc @@ -20476,7 +20476,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 @@ -20484,7 +20484,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc @@ -20496,7 +20496,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v24 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, 
v24 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 @@ -20504,7 +20504,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc @@ -20516,7 +20516,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v23 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 @@ -20524,7 +20524,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc @@ -20536,7 +20536,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v22 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, 
v22, v22 @@ -20544,7 +20544,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc @@ -20556,7 +20556,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v21 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 @@ -20564,7 +20564,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc @@ -20576,7 +20576,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v20 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 @@ -20584,7 +20584,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: 
v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc @@ -20596,7 +20596,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v19 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 @@ -20604,7 +20604,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc @@ -20616,7 +20616,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v18 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 @@ -20624,7 +20624,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, 
v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc @@ -20636,7 +20636,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v17 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -20644,7 +20644,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc @@ -20688,7 +20688,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 @@ -20697,7 +20697,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 ; 
GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 @@ -20710,7 +20710,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 @@ -20723,19 +20723,19 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_min_f32_e32 v29, v15, v29 ; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32 ; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc ; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29 ; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13 ; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 @@ -20744,7 +20744,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_min_f32_e32 v32, v33, v32 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32 ; GFX9-NEXT: v_and_b32_e32 v28, 
0xffff0000, v28 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 @@ -20753,7 +20753,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 @@ -20762,7 +20762,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_min_f32_e32 v28, v33, v28 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v28 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 @@ -20771,7 +20771,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 @@ -20780,7 +20780,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_min_f32_e32 v27, v33, v27 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v27 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 @@ -20789,7 +20789,7 @@ define <32 x bfloat> 
@v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 @@ -20798,7 +20798,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_min_f32_e32 v26, v33, v26 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v26 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 @@ -20807,7 +20807,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 @@ -20816,7 +20816,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_min_f32_e32 v25, v33, v25 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 @@ -20825,7 +20825,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v24, 
v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -20834,7 +20834,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v24 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 @@ -20843,7 +20843,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -20852,7 +20852,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v23 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 @@ -20861,7 +20861,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 
0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -20870,7 +20870,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_min_f32_e32 v22, v33, v22 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v22 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 @@ -20879,7 +20879,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -20888,7 +20888,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_min_f32_e32 v21, v33, v21 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v21 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 @@ -20897,7 +20897,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -20906,7 +20906,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v20 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 @@ -20915,7 +20915,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -20924,7 +20924,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v19 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 @@ -20933,7 +20933,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -20942,7 +20942,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v18 +; GFX9-NEXT: 
v_and_b32_e32 v34, 0xff800000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 @@ -20951,7 +20951,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -20960,7 +20960,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v17 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 @@ -20969,7 +20969,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -21061,7 +21061,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: s_mov_b32 s23, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 @@ -21283,7 +21283,7 @@ define <32 
x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -21594,7 +21594,7 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -21609,7 +21609,7 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -21622,7 +21622,7 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -21637,7 +21637,7 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -21707,7 +21707,7 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -21715,7 +21715,7 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc @@ -21731,7 +21731,7 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 @@ -21740,7 +21740,7 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -21756,7 +21756,7 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x 
bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 @@ -21779,7 +21779,7 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_max_f32_e32 v2, v3, v2 @@ -21870,7 +21870,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -21883,7 +21883,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -21891,7 +21891,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, 
v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -21908,7 +21908,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -21917,7 +21917,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 @@ -21926,7 +21926,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -21946,7 +21946,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 @@ -22057,7 +22057,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -22066,7 +22066,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -22078,7 +22078,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -22086,7 +22086,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc @@ -22104,7 +22104,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 
0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 @@ -22113,7 +22113,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -22122,7 +22122,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 @@ -22131,7 +22131,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -22156,7 +22156,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 ; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 @@ -22191,7 +22191,7 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 @@ -22380,7 +22380,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -22389,7 +22389,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc @@ -22401,7 +22401,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -22409,7 +22409,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: 
v_and_b32_e32 v9, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc @@ -22421,7 +22421,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -22429,7 +22429,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc @@ -22441,7 +22441,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -22449,7 +22449,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: 
v_cndmask_b32_e32 v0, v4, v9, vcc @@ -22471,7 +22471,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 ; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 @@ -22480,7 +22480,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -22489,7 +22489,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX9-NEXT: v_max_f32_e32 v7, v9, v7 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 @@ -22498,7 +22498,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -22507,7 +22507,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_max_f32_e32 v6, v9, v6 ; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; 
GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 @@ -22516,7 +22516,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -22525,7 +22525,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v5, v9, v5 ; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 @@ -22534,7 +22534,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -22559,7 +22559,7 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 ; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 ; GFX10-NEXT: v_max_f32_e32 v9, v11, v9 @@ -22630,7 +22630,7 @@ define <8 x 
bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 ; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 @@ -22990,7 +22990,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 @@ -22998,7 +22998,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc @@ -23010,7 +23010,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v15 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v14 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 @@ -23018,7 +23018,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: 
v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc @@ -23030,7 +23030,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v14 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v13 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 @@ -23038,7 +23038,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc @@ -23050,7 +23050,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v13 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v12 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 @@ -23058,7 +23058,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, 
v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc @@ -23070,7 +23070,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v11 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 @@ -23078,7 +23078,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc @@ -23090,7 +23090,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v11 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v10 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 @@ -23098,7 +23098,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 
v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc @@ -23110,7 +23110,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v10 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 @@ -23118,7 +23118,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc @@ -23130,7 +23130,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0x80000000, v9 +; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 @@ -23138,7 +23138,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc @@ -23168,7 +23168,7 @@ define <16 x 
bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 ; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v16 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 @@ -23177,7 +23177,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7 ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -23186,7 +23186,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX9-NEXT: v_max_f32_e32 v15, v17, v15 ; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v15 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 @@ -23195,7 +23195,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -23204,7 +23204,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 ; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; 
GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 @@ -23213,7 +23213,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -23222,7 +23222,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_max_f32_e32 v13, v17, v13 ; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v13 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 @@ -23231,7 +23231,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -23240,7 +23240,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 ; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v12 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 @@ -23249,7 +23249,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -23258,7 +23258,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 ; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v11 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 @@ -23267,7 +23267,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -23276,7 +23276,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 ; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v10 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 @@ -23285,7 +23285,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: 
v_max_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -23294,7 +23294,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 ; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0x80000000, v9 +; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 @@ -23303,7 +23303,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -23326,7 +23326,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 ; GFX10-NEXT: v_max_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 @@ -23469,7 +23469,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 @@ -24289,7 +24289,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX8-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 @@ -24297,7 +24297,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14 ; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc @@ -24320,14 +24320,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_max_f32_e32 v30, v15, v30 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v33 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v30 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 @@ -24335,13 +24335,13 @@ 
define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc @@ -24353,7 +24353,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v29 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29 ; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 @@ -24361,7 +24361,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc @@ -24373,7 +24373,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 
0x80000000, v28 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 @@ -24381,7 +24381,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc @@ -24393,7 +24393,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v27 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 @@ -24401,7 +24401,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc @@ -24413,7 +24413,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v26 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 ; 
GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 @@ -24421,7 +24421,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc @@ -24433,7 +24433,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 @@ -24441,7 +24441,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc @@ -24453,7 +24453,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v24 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 @@ -24461,7 +24461,7 @@ define <32 x 
bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc @@ -24473,7 +24473,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v23 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 @@ -24481,7 +24481,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc @@ -24493,7 +24493,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v22 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 @@ -24501,7 +24501,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; 
GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc @@ -24513,7 +24513,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v21 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 @@ -24521,7 +24521,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc @@ -24533,7 +24533,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v20 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 @@ -24541,7 +24541,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 
0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc @@ -24553,7 +24553,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v19 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 @@ -24561,7 +24561,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc @@ -24573,7 +24573,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v18 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 @@ -24581,7 +24581,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc @@ -24593,7 +24593,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0x80000000, v17 +; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -24601,7 +24601,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc @@ -24645,7 +24645,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 @@ -24654,7 +24654,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v32, 0x80000000, v14 +; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14 ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 @@ -24667,7 +24667,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x 
bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 ; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 @@ -24680,19 +24680,19 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_max_f32_e32 v29, v15, v29 ; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32 ; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc ; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29 ; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13 ; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 @@ -24701,7 +24701,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_max_f32_e32 v32, v33, v32 ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v32 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 @@ -24710,7 +24710,7 @@ define <32 x bfloat> 
@v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12 ; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 @@ -24719,7 +24719,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_max_f32_e32 v28, v33, v28 ; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v28 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 @@ -24728,7 +24728,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11 ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 @@ -24737,7 +24737,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_max_f32_e32 v27, v33, v27 ; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v27 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 @@ -24746,7 +24746,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 ; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: 
v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10 ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 @@ -24755,7 +24755,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_max_f32_e32 v26, v33, v26 ; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v26 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 @@ -24764,7 +24764,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9 ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 @@ -24773,7 +24773,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_max_f32_e32 v25, v33, v25 ; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v25 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 @@ -24782,7 +24782,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8 ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 ; GFX9-NEXT: 
v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -24791,7 +24791,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_max_f32_e32 v24, v33, v24 ; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v24 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 @@ -24800,7 +24800,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7 ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 @@ -24809,7 +24809,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_max_f32_e32 v23, v33, v23 ; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v23 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 @@ -24818,7 +24818,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -24827,7 +24827,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; 
GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_max_f32_e32 v22, v33, v22 ; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v22 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 @@ -24836,7 +24836,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -24845,7 +24845,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_max_f32_e32 v21, v33, v21 ; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v21 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 @@ -24854,7 +24854,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -24863,7 +24863,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_max_f32_e32 v20, v33, v20 ; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v20 +; 
GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 @@ -24872,7 +24872,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -24881,7 +24881,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_max_f32_e32 v19, v33, v19 ; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v19 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 @@ -24890,7 +24890,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -24899,7 +24899,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_max_f32_e32 v18, v33, v18 ; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v18 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 
@@ -24908,7 +24908,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -24917,7 +24917,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v17, v33, v17 ; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0x80000000, v17 +; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 @@ -24926,7 +24926,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -25018,7 +25018,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: s_brev_b32 s23, 1 +; GFX10-NEXT: s_mov_b32 s23, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 @@ -25240,7 +25240,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 ; 
GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -25586,7 +25586,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -25617,7 +25617,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -25641,7 +25641,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4 ; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 @@ -25676,7 +25676,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc_lo @@ -25724,7 +25724,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -25738,7 +25738,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -25750,7 +25750,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -25764,7 +25764,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -25816,7 +25816,7 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: 
v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -25831,7 +25831,7 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -25844,7 +25844,7 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 @@ -25947,7 +25947,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -25978,7 +25978,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -25990,7 +25990,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: 
v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -26015,7 +26015,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo @@ -26097,7 +26097,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -26119,7 +26119,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -26131,7 +26131,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo @@ -26150,7 +26150,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; 
GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo @@ -26257,7 +26257,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -26288,7 +26288,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -26300,7 +26300,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -26325,7 +26325,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo @@ -26442,7 +26442,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; 
GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -26473,7 +26473,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -26485,7 +26485,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1 @@ -26511,7 +26511,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1 @@ -26594,7 +26594,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -26616,7 +26616,7 @@ 
define bfloat @v_exp2_bf16(bfloat %a) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -26628,7 +26628,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo @@ -26647,7 +26647,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo @@ -26752,7 +26752,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -26783,7 +26783,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 
v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -26795,7 +26795,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1 @@ -26821,7 +26821,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1 @@ -26882,7 +26882,7 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -26896,7 +26896,7 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX9-NEXT: v_ceil_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -26908,7 +26908,7 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_ceil_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 
; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -26922,7 +26922,7 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ceil_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -26967,7 +26967,7 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -26981,7 +26981,7 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX9-NEXT: v_trunc_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -26993,7 +26993,7 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -27007,7 +27007,7 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_trunc_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -27052,7 +27052,7 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -27066,7 +27066,7 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -27078,7 +27078,7 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -27092,7 +27092,7 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -27137,7 +27137,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -27151,7 +27151,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -27163,7 +27163,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -27177,7 +27177,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -27240,7 +27240,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -27260,7 +27260,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; 
GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -27276,7 +27276,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -27297,7 +27297,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 @@ -27343,7 +27343,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -27357,7 +27357,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -27369,7 +27369,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; 
GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -27383,7 +27383,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -27428,7 +27428,7 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -27442,7 +27442,7 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX9-NEXT: v_floor_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -27454,7 +27454,7 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_floor_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -27468,7 +27468,7 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 
0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_floor_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -27505,7 +27505,7 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -27519,7 +27519,7 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -27531,7 +27531,7 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 @@ -27545,7 +27545,7 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -31058,7 +31058,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_bfe_u32 v2, v0, 
16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -31072,7 +31072,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -31084,7 +31084,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -31097,7 +31097,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -31143,7 +31143,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; 
GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -31151,7 +31151,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -31166,13 +31166,13 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -31186,7 +31186,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 @@ -31205,7 +31205,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_bfe_i32 v1, v0, 
0, 16 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 @@ -31266,14 +31266,14 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -31281,7 +31281,7 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc @@ -31298,19 +31298,19 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v2, v2, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -31325,7 +31325,7 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 @@ -31393,7 +31393,7 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -31401,14 +31401,14 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 @@ -31416,7 +31416,7 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc @@ -31434,26 +31434,26 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, 
v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -31468,7 +31468,7 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 @@ -31501,7 +31501,7 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX11-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 @@ -31557,7 +31557,7 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -31571,7 +31571,7 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: 
v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -31583,7 +31583,7 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -31596,7 +31596,7 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 @@ -31636,14 +31636,14 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -31659,13 +31659,13 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; 
GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -31679,7 +31679,7 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 @@ -31698,7 +31698,7 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -31749,21 +31749,21 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: 
v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -31782,19 +31782,19 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -31809,7 +31809,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 @@ -31869,14 +31869,14 @@ define <4 x bfloat> 
@v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -31884,14 +31884,14 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -31910,26 +31910,26 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, 
v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -31944,7 +31944,7 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 @@ -31976,7 +31976,7 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 @@ -32063,7 +32063,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 
0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -32087,7 +32087,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -32100,7 +32100,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2 @@ -32124,7 +32124,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX11-NEXT: v_cls_i32_e32 v3, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 @@ -32242,7 +32242,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 ; GFX8-NEXT: v_min_u32_e32 v7, v0, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 @@ -32254,7 +32254,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 
v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -32287,7 +32287,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 @@ -32297,7 +32297,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -32313,7 +32313,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3 ; GFX10-NEXT: v_ffbh_i32_e32 v6, v1 ; GFX10-NEXT: v_ffbh_i32_e32 v7, v3 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 @@ -32354,7 +32354,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: v_xor_b32_e32 v5, v2, v3 ; GFX11-NEXT: v_cls_i32_e32 v6, v1 ; GFX11-NEXT: v_cls_i32_e32 v7, v3 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX11-NEXT: v_add_nc_u32_e32 v6, -1, v6 @@ -32515,7 +32515,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_min_u32_e32 v7, v7, v8 ; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -32539,7 +32539,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc @@ -32548,7 +32548,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc @@ -32583,7 +32583,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -32599,7 +32599,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 @@ -32609,7 +32609,7 @@ define <3 x bfloat> 
@v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -32638,7 +32638,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_min_u32_e32 v8, v10, v8 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v6, v6, v7 ; GFX10-NEXT: v_min_u32_e32 v7, v11, v9 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] @@ -32822,7 +32822,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 ; GFX8-NEXT: v_min_u32_e32 v11, v4, v5 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v10 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -32844,7 +32844,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc @@ -32860,7 +32860,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 ; GFX8-NEXT: v_min_u32_e32 v9, v0, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; 
GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 @@ -32872,7 +32872,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -32907,7 +32907,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 ; GFX9-NEXT: v_min_u32_e32 v11, v4, v5 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v10 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 @@ -32927,7 +32927,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -32943,7 +32943,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-NEXT: v_min_u32_e32 v9, v0, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 @@ -32953,7 +32953,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 
0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -32989,7 +32989,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] ; GFX10-NEXT: v_min_u32_e32 v9, v12, v9 ; GFX10-NEXT: v_min_u32_e32 v11, v13, v14 @@ -33065,7 +33065,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_u32_e32 v9, v12, v9 @@ -33148,7 +33148,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -33162,7 +33162,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -33174,7 
+33174,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -33187,7 +33187,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -33233,7 +33233,7 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -33241,7 +33241,7 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -33256,13 +33256,13 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: 
v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -33276,7 +33276,7 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 @@ -33295,7 +33295,7 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 @@ -33357,21 +33357,21 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 1 
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc @@ -33388,19 +33388,19 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v2, v2, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -33415,7 +33415,7 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa 
v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 @@ -33484,14 +33484,14 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -33499,14 +33499,14 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc @@ -33524,26 +33524,26 @@ define <4 x bfloat> 
@v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v5 ; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -33558,7 +33558,7 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; 
GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 @@ -33589,7 +33589,7 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 @@ -33652,7 +33652,7 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -33666,7 +33666,7 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -33678,7 +33678,7 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -33691,7 +33691,7 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 @@ -33731,14 +33731,14 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -33754,13 +33754,13 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -33774,7 +33774,7 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX10-NEXT: v_and_or_b32 
v4, v0, s4, 0x400000 @@ -33793,7 +33793,7 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 @@ -33844,21 +33844,21 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -33877,19 +33877,19 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -33904,7 +33904,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 @@ -33964,14 +33964,14 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -33979,14 +33979,14 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -34005,26 +34005,26 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -34039,7 +34039,7 @@ define <4 x 
bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 @@ -34071,7 +34071,7 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 @@ -34146,7 +34146,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -34166,7 +34166,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -34178,7 +34178,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 @@ -34198,7 +34198,7 @@ 
define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -34286,7 +34286,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 ; GFX8-NEXT: v_min_u32_e32 v7, 32, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 @@ -34298,7 +34298,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -34323,7 +34323,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 ; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 @@ -34333,7 +34333,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; 
GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -34347,7 +34347,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v4, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v5, v3 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX10-NEXT: v_min_u32_e32 v5, 32, v5 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -34380,7 +34380,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1 ; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX11-NEXT: v_min_u32_e32 v5, 32, v5 @@ -34503,7 +34503,7 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -34522,7 +34522,7 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc @@ -34531,7 +34531,7 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, 
vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc @@ -34558,7 +34558,7 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -34570,7 +34570,7 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 ; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v5 +; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 @@ -34580,7 +34580,7 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -34596,7 +34596,7 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10-NEXT: v_ffbh_u32_e32 v6, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v8, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v7, v5 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v6, 32, v6 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX10-NEXT: v_min_u32_e32 v7, 32, v7 @@ -34741,7 +34741,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_ffbh_u32_e32 v4, v7 ; GFX8-NEXT: 
v_min_u32_e32 v11, 32, v4 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] -; GFX8-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 @@ -34759,7 +34759,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v4 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc @@ -34771,7 +34771,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 ; GFX8-NEXT: v_min_u32_e32 v9, 32, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 @@ -34783,7 +34783,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -34810,7 +34810,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 ; GFX9-NEXT: v_min_u32_e32 v11, 32, v4 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_ffbh_u32_e32 v8, v1 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 @@ -34826,7 
+34826,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -34838,7 +34838,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 ; GFX9-NEXT: v_min_u32_e32 v9, 32, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 @@ -34848,7 +34848,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -34865,7 +34865,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX10-NEXT: v_ffbh_u32_e32 v10, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v11, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v9, v7 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX10-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX10-NEXT: v_min_u32_e32 v11, 32, v11 @@ -34925,7 +34925,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 ; GFX11-NEXT: v_clz_i32_u32_e32 v11, v3 ; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: 
v_min_u32_e32 v11, 32, v11 @@ -40088,7 +40088,7 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -40104,7 +40104,7 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -40118,7 +40118,7 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 ; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1 ; GFX10-NEXT: v_and_or_b32 v1, v2, s4, 0x400000 @@ -40134,7 +40134,7 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1 ; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 @@ -40206,7 +40206,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: 
v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -40214,7 +40214,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -40234,13 +40234,13 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 @@ -40259,7 +40259,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 ; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1 @@ -40284,7 +40284,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 @@ -40375,7 +40375,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -40390,7 +40390,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -40398,7 +40398,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -40416,7 +40416,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; 
GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -40429,13 +40429,13 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 @@ -40460,7 +40460,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1 ; GFX10-NEXT: v_and_or_b32 v3, v6, s4, 0x400000 @@ -40572,7 +40572,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -40581,7 +40581,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, 
v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -40595,7 +40595,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -40603,7 +40603,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -40625,13 +40625,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 @@ -40645,13 +40645,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x 
bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 @@ -40681,7 +40681,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff ; GFX10-NEXT: v_and_or_b32 v1, v6, s4, 0x400000 @@ -40717,7 +40717,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3 ; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4 @@ -40803,7 +40803,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc @@ -40813,7 +40813,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -40828,7 +40828,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v3, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -40837,7 +40837,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -40850,7 +40850,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_and_or_b32 v3, v0, s4, 0x400000 @@ -40873,7 +40873,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -40958,7 +40958,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc @@ -40971,7 +40971,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -40979,7 +40979,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc @@ -40989,7 +40989,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -41005,7 +41005,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -41014,7 +41014,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 @@ -41023,7 +41023,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -41032,7 +41032,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -41048,7 +41048,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1 @@ -41087,7 +41087,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -41210,7 +41210,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc @@ -41221,7 +41221,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -41231,7 +41231,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; 
GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc @@ -41243,7 +41243,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -41251,7 +41251,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc @@ -41261,7 +41261,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -41278,7 +41278,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -41287,7 +41287,7 @@ define <3 x bfloat> 
@v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -41296,7 +41296,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -41305,7 +41305,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 @@ -41314,7 +41314,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -41323,7 +41323,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 
+; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -41343,7 +41343,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -41492,7 +41492,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc @@ -41505,7 +41505,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -41513,7 +41513,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc @@ -41523,7 +41523,7 @@ 
define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc @@ -41533,7 +41533,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc @@ -41545,7 +41545,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -41553,7 +41553,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc @@ -41563,7 +41563,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc @@ -41581,7 +41581,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 @@ -41590,7 +41590,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v6 +; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 @@ -41599,7 +41599,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -41608,7 +41608,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 @@ -41617,7 +41617,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -41626,7 +41626,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 @@ -41635,7 +41635,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -41644,7 +41644,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -41667,7 +41667,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: s_brev_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_and_or_b32 v3, v6, s4, 0x400000 ; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7 @@ -41736,7 +41736,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll index cfe1e46bf2c5e..d35871e3774de 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -790,7 +790,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc @@ -806,7 +806,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll 
index b88aa1ce33fb3..9142858806f1c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1524,7 +1524,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX900-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -1608,7 +1608,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -1632,7 +1632,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_brev_b32 s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0xff800000 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s2, -4 ; GFX10-NEXT: s_mov_b32 s1, s3 @@ -1673,7 +1673,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: s_brev_b32 s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0xff800000 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s0, s2, -4 @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX900-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -1786,7 +1786,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4 ; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -1828,7 +1828,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX90A-NEXT: v_or_b32_e32 
v4, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -1854,7 +1854,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_brev_b32 s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0xff800000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s2, -4 ; GFX10-NEXT: s_mov_b32 s1, s3 @@ -1895,7 +1895,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX11-LABEL: global_atomic_fadd_ret_bf16_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: s_brev_b32 s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0xff800000 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s0, s2, -4 diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll index ad788b8d55014..6a7fb7142c293 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll @@ -912,7 +912,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608 ; DAGISEL-GFX11-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept 
V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec @@ -934,7 +934,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608 ; DAGISEL-GFX11-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec @@ -956,7 +956,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 
-8388608 ; DAGISEL-GFX10-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec @@ -978,7 +978,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608 ; DAGISEL-GFX10-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index e906b5327c362..48ae98f125bf4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1413,7 +1413,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr 
addrspace(3) %ptr) nounwind { ; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 ; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; VI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 +; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -1451,7 +1451,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 @@ -1560,7 +1560,7 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: v_add_f32_e32 v4, 4.0, v4 ; VI-NEXT: v_bfe_u32 v6, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; VI-NEXT: v_and_b32_e32 v7, 0x80000000, v4 +; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v4 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 @@ -1597,7 +1597,7 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 590b40960faab..672c93b6adf7f 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ 
b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -4262,20 +4262,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11 ; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2 ; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v7 +; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v7 ; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2 ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0x80000000, v8 +; GFX9-NEXT: v_and_b32_e32 v14, 0xff800000, v8 ; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2 ; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v16, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2 ; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc @@ -4298,20 +4298,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2 ; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v3 +; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v2 +; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2 ; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0x80000000, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2 ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc @@ -4332,7 +4332,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX10-NEXT: s_brev_b32 s2, 1 +; GFX10-NEXT: s_mov_b32 s2, 0xff800000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] @@ -4416,7 +4416,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX11-NEXT: s_brev_b32 s0, 1 +; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] From d17eade22ab9a65144a2bbd538f47924eed6b87d Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Wed, 21 Feb 2024 14:28:34 -0800 Subject: [PATCH 155/351] Do not call disable / enable on null depot (#82542) depot can be null if allocation_ring_buffer_size=0 --- compiler-rt/lib/scudo/standalone/combined.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 080ba42ad4449..f3c3d757c9f12 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -688,12 +688,14 @@ class Allocator { Quarantine.disable(); Primary.disable(); Secondary.disable(); - Depot->disable(); + if (Depot) + Depot->disable(); } void enable() NO_THREAD_SAFETY_ANALYSIS { 
initThreadMaybe(); - Depot->enable(); + if (Depot) + Depot->enable(); Secondary.enable(); Primary.enable(); Quarantine.enable(); From be36812fb7cb3fca05f20865e062c966a14dbfdc Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Wed, 21 Feb 2024 22:43:10 +0000 Subject: [PATCH 156/351] [TargetLowering] Be more efficient in fp -> bf16 NaN conversions We can avoid masking completely as it is OK (and probably preferable) to bring over some of the existant NaN payload. --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 10 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 7262 ++++++++--------- .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 6 +- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 30 +- .../isel-amdgpu-cs-chain-preserve-cc.ll | 20 +- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 12 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 72 +- 7 files changed, 3238 insertions(+), 4174 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a4c5167ade376..07fb89127a737 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10948,12 +10948,10 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const { Op = expandRoundInexactToOdd(F32, Op, dl, DAG); Op = DAG.getNode(ISD::BITCAST, dl, I32, Op); - // Extract the sign bit and exponent. - SDValue SignBitAndExponentField = DAG.getNode( - ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32)); - // Set the quiet bit. - SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField, - DAG.getConstant(0x400000, dl, I32)); + // Conversions should set NaN's quiet bit. This also prevents NaNs from + // turning into infinities. + SDValue NaN = + DAG.getNode(ISD::OR, dl, I32, Op, DAG.getConstant(0x400000, dl, I32)); // Factor in the contribution of the low 16 bits. 
SDValue One = DAG.getConstant(1, dl, I32); diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 63a09e49e0051..8ec7dfd93cd09 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2182,9 +2182,8 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2199,9 +2198,8 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off @@ -2212,10 +2210,9 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo @@ -2226,10 +2223,9 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -2294,7 +2290,6 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v5 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2323,7 +2318,6 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4 ; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v5 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off @@ -2343,14 +2337,13 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5 ; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] ; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 ; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v5, v5, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, 
vcc_lo ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2369,9 +2362,8 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5] ; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5] ; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo @@ -2380,7 +2372,7 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff -; GFX11-NEXT: v_and_or_b32 v5, v5, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8999,8 +8991,7 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -9014,9 +9005,8 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: 
v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -9027,10 +9017,9 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -9042,11 +9031,10 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -9104,16 +9092,14 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -9126,20 +9112,18 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -9153,14 +9137,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 
0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -9176,16 +9159,15 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -9255,8 +9237,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -9268,16 +9249,14 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -9293,27 +9272,24 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add3_u32 v2, 
v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -9331,18 +9307,17 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -9426,17 +9401,15 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: 
v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -9447,16 +9420,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -9471,38 +9442,34 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 
v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -9523,31 +9490,30 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 +; GFX10-NEXT: v_add_f32_e32 v3, v7, v6 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_add3_u32 
v5, v5, v4, 0x7fff ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v4bf16: @@ -9555,45 +9521,42 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 
v3, 0xffff0000, v3 -; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -9717,17 +9680,15 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 @@ -9738,16 +9699,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 
x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 @@ -9758,16 +9717,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 @@ -9778,16 +9735,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX8-NEXT: 
v_add_f32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -9806,74 +9761,66 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, 
v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 
v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -9890,62 +9837,61 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 -; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 -; GFX10-NEXT: v_add_f32_e32 v9, v11, v9 -; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX10-NEXT: v_add_f32_e32 v7, v10, v9 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff +; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1 ; 
GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_add_f32_e32 v6, v10, v6 +; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_add_f32_e32 v6, v11, v6 -; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_f32_e32 v5, v15, v13 -; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX10-NEXT: v_bfe_u32 
v13, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo -; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000 -; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -9953,81 +9899,80 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fadd_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 -; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_f32_e32 v9, v11, v9 -; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_add_f32_e32 v7, v10, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX11-NEXT: 
v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v6, v10, v6 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo -; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: 
v_add_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff -; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10 ; GFX11-NEXT: v_add_f32_e32 v5, v15, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v1, v1 @@ -10036,9 +9981,9 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fadd <8 x bfloat> %a, %b ret <8 x bfloat> %op @@ -10263,16 +10208,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 @@ -10283,16 +10226,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX8-NEXT: v_bfe_u32 v14, v6, 
16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 @@ -10303,16 +10244,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 @@ -10323,16 +10262,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 -; GFX8-NEXT: 
v_and_b32_e32 v17, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 @@ -10343,16 +10280,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 @@ -10363,16 +10298,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; 
GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 @@ -10383,16 +10316,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 @@ -10403,16 +10334,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -10439,146 +10368,130 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; 
GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 
v17, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 
0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v10, v10, 
v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 
v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -10599,27 +10512,26 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_add_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 -; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff ; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff -; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo @@ -10633,7 +10545,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_add_f32_e32 v5, v5, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff -; 
GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 @@ -10643,10 +10555,10 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v13, v19, v18 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff -; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v4, v4, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo @@ -10656,14 +10568,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo -; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_add_f32_e32 v12, v18, v12 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 @@ -10675,12 +10587,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 
v20, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -10689,8 +10601,8 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff @@ -10702,17 +10614,17 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 ; GFX10-NEXT: v_add_f32_e32 v9, v22, v20 -; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo -; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1 ; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff @@ -10741,12 +10653,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v17, v18, v17 ; GFX11-NEXT: v_add_f32_e32 v6, v6, v14 @@ -10759,13 +10670,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_add_f32_e32 v7, v7, v15 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff -; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff -; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, 
vcc_lo ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 @@ -10787,32 +10698,32 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff -; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_add_f32_e32 v12, v18, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff -; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX11-NEXT: v_perm_b32 v5, 
v5, v16, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 @@ -10828,7 +10739,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 -; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v11 @@ -10838,13 +10749,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff -; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff @@ -10861,13 +10772,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_add_f32_e32 v9, v22, v20 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff -; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 -; 
GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo -; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 @@ -11434,16 +11345,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX8-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14 -; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 @@ -11465,29 +11374,25 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_add_f32_e32 v30, v15, v30 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: 
v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 @@ -11498,16 +11403,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29 ; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 @@ -11518,16 +11421,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: 
v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 @@ -11538,16 +11439,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 @@ -11558,16 +11457,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX8-NEXT: 
v_add_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 @@ -11578,16 +11475,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 @@ -11598,16 +11493,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; 
GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 @@ -11618,16 +11511,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 @@ -11638,16 +11529,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, 
v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 @@ -11658,16 +11547,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 @@ -11678,16 +11565,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 
vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 @@ -11698,16 +11583,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 @@ -11718,16 +11601,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 @@ -11738,16 +11619,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 
x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -11788,292 +11667,260 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, 
v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30 +; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, v32, v29 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v33, v33, v34 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_add_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32 -; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 -; GFX9-NEXT: v_or_b32_e32 
v33, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13 -; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 
v28, 16, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_add_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_add_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: 
v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: 
v_or_b32_e32 v33, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: 
v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_add_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: 
v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0 +; GFX9-NEXT: 
v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -12098,7 +11945,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-LABEL: v_fadd_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 @@ -12163,7 +12010,6 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: s_mov_b32 s23, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 @@ -12179,10 +12025,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 ; GFX10-NEXT: v_add_f32_e32 v17, v26, v50 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 +; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 ; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 @@ -12220,28 +12066,28 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { 
; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 ; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 ; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 ; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 ; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 ; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 ; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 ; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 ; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 +; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 @@ -12260,12 +12106,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 ; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v64, v1, s11 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 ; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 ; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 ; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff ; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 @@ -12282,51 +12128,51 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 ; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 ; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 ; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 ; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 ; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff ; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 ; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 ; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 ; GFX10-NEXT: v_add3_u32 v6, v65, 
v6, 0x7fff ; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 ; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 ; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 ; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 ; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 ; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 ; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 ; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff ; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 @@ -12334,7 +12180,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 ; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 ; GFX10-NEXT: 
v_cndmask_b32_e32 v5, v65, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 @@ -12358,14 +12204,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 ; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 -; GFX10-NEXT: v_add_f32_e32 v17, v32, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; GFX10-NEXT: v_add_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_add_f32_e32 v15, v15, v18 ; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 ; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff @@ -12378,212 +12224,219 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-LABEL: v_fadd_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; 
GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1 +; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_dual_add_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 -; GFX11-NEXT: v_add_f32_e32 v24, v64, v55 -; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 +; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 -; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX11-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 +; GFX11-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83 +; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24 +; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23 +; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 -; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 -; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_add_f32_e32 v18, v84, v83 -; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 -; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000 -; 
GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff -; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff +; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7 +; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v4, v4, v20 ; GFX11-NEXT: v_add_f32_e32 v20, v80, v71 -; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_add_f32_e32 v25, v54, v53 +; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30 -; GFX11-NEXT: v_add_f32_e32 v28, v48, v39 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 -; GFX11-NEXT: 
v_dual_add_f32 v27, v50, v49 :: v_dual_add_f32 v26, v52, v51 -; GFX11-NEXT: v_dual_add_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_add_f32_e32 v26, v52, v51 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 +; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX11-NEXT: v_add_f32_e32 v28, v48, v39 ; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33 -; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 
1 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 -; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff -; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 ; GFX11-NEXT: v_add3_u32 v55, v55, 
v11, 0x7fff -; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff -; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff -; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 -; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26 +; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 +; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8 +; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff +; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff -; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000 -; 
GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 -; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 +; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22 +; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff -; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21 +; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 -; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff -; 
GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff -; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff -; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 -; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 @@ -12622,22 +12475,21 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 -; GFX11-NEXT: 
v_perm_b32 v2, v2, v19, 0x7060302 -; GFX11-NEXT: v_add_f32_e32 v17, v32, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 ; GFX11-NEXT: v_add_f32_e32 v15, v15, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -12678,8 +12530,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -12692,9 +12543,8 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 @@ -12704,10 +12554,9 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -12718,11 +12567,10 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -12761,8 +12609,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -12775,9 +12622,8 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: 
v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -12787,10 +12633,9 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -12801,11 +12646,10 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -12849,8 +12693,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -12864,9 +12707,8 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_sub_f32_e32 v0, 
v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -12877,10 +12719,9 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -12892,11 +12733,10 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -12954,16 +12794,14 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, 
v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -12976,20 +12814,18 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -13003,14 +12839,13 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: 
v_sub_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -13026,16 +12861,15 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -13105,8 +12939,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 
16, v2 @@ -13118,16 +12951,14 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -13143,27 +12974,24 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; 
GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -13181,18 +13009,17 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -13276,17 +13103,15 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, 
v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -13297,16 +13122,14 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -13321,38 +13144,34 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; 
GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -13373,31 +13192,30 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 +; GFX10-NEXT: v_sub_f32_e32 v3, v7, v6 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_add3_u32 v3, v3, v4, 
0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fsub_v4bf16: @@ -13405,45 +13223,42 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 
s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fsub <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -13481,8 +13296,7 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -13496,9 +13310,8 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -13509,10 +13322,9 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -13524,11 +13336,10 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -13586,16 +13397,14 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 
0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -13608,20 +13417,18 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -13635,14 +13442,13 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -13658,16 +13464,15 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -13737,8 +13542,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -13750,16 +13554,14 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; 
GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -13775,27 +13577,24 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 
0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -13813,18 +13612,17 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -13908,17 +13706,15 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -13929,16 +13725,14 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -13953,38 +13747,34 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: 
v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -14005,31 +13795,30 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 +; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: 
v_add3_u32 v4, v7, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v4bf16: @@ -14037,45 +13826,42 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX11-NEXT: v_mul_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 
-; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <4 x bfloat> %a, %b ret <4 x bfloat> %op @@ -14199,17 +13985,15 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 @@ -14220,16 +14004,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v6, 
0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 @@ -14240,16 +14022,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 @@ -14260,16 +14040,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v10, 
0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -14288,74 +14066,66 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; 
GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v9, v5, 
16, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -14372,62 +14142,61 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 -; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 -; GFX10-NEXT: v_mul_f32_e32 v9, v11, v9 -; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX10-NEXT: v_mul_f32_e32 v7, v10, v9 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff +; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; 
GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_mul_f32_e32 v6, v10, v6 +; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_mul_f32_e32 v6, v11, v6 -; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 ; GFX10-NEXT: v_mul_f32_e32 v5, v15, v13 -; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 
v15, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo -; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000 -; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -14435,81 +14204,80 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fmul_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 -; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_f32_e32 v9, v11, v9 -; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000 -; GFX11-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_mul_f32_e32 v7, v10, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000 -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_f32_e32 v6, v10, v6 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo -; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 
v13, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff -; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10 ; GFX11-NEXT: v_mul_f32_e32 v5, v15, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -14518,9 +14286,9 @@ define <8 x bfloat> 
@v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = fmul <8 x bfloat> %a, %b ret <8 x bfloat> %op @@ -14745,16 +14513,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 @@ -14765,16 +14531,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; 
GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 @@ -14785,16 +14549,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 @@ -14805,16 +14567,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v17, 
0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 @@ -14825,16 +14585,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 @@ -14845,16 +14603,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX8-NEXT: 
v_lshlrev_b32_e32 v10, 16, v9 @@ -14865,16 +14621,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 @@ -14885,16 +14639,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -14921,146 +14673,130 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v17, 16, v7 ; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, 
v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 ; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; 
GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, 
v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -15081,27 +14817,26 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 -; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff ; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff -; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo @@ -15115,7 +14850,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000 +; GFX10-NEXT: 
v_or_b32_e32 v13, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 @@ -15125,10 +14860,10 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v13, v19, v18 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff -; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 ; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo @@ -15138,14 +14873,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo -; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 @@ -15157,12 +14892,12 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 
0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -15171,8 +14906,8 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff @@ -15184,17 +14919,17 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9 ; GFX10-NEXT: v_mul_f32_e32 v9, v22, v20 -; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo -; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_bfe_u32 v20, v0, 
16, 1 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff @@ -15223,12 +14958,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mul_f32_e32 v17, v18, v17 ; GFX11-NEXT: v_mul_f32_e32 v6, v6, v14 @@ -15241,13 +14975,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff -; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff -; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 @@ -15269,32 +15003,32 
@@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff -; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_mul_f32_e32 v12, v18, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff -; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, 
vcc_lo ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 @@ -15310,7 +15044,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 -; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_mul_f32_e32 v3, v3, v11 @@ -15320,13 +15054,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff -; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff @@ -15343,13 +15077,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_mul_f32_e32 v9, v22, v20 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff -; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000 +; GFX11-NEXT: 
v_or_b32_e32 v24, 0x400000, v9 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo -; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 @@ -15916,16 +15650,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14 -; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 @@ -15947,29 +15679,25 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 
0x400000, v30 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 @@ -15980,16 +15708,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29 ; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 @@ -16000,16 +15726,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, 
s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 @@ -16020,16 +15744,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 @@ -16040,16 +15762,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; 
GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 @@ -16060,16 +15780,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 @@ -16080,16 +15798,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, 
vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 @@ -16100,16 +15816,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 @@ -16120,16 +15834,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 
v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 @@ -16140,16 +15852,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 @@ -16160,16 +15870,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX8-NEXT: 
v_lshlrev_b32_e32 v19, 16, v18 @@ -16180,16 +15888,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 @@ -16200,16 +15906,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 @@ -16220,16 +15924,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v16, 
0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -16270,292 +15972,260 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX9-NEXT: 
v_lshlrev_b32_e32 v30, 16, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30 +; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_mul_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32 -; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; 
GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13 -; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; 
GFX9-NEXT: v_mul_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX9-NEXT: 
v_lshlrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: 
v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: 
v_or_b32_e32 v33, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: 
v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: 
v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -16580,7 +16250,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-LABEL: v_fmul_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 @@ -16645,7 +16315,6 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: s_mov_b32 s23, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 @@ -16661,10 +16330,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 ; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 +; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 ; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 @@ -16702,28 +16371,28 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 ; 
GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 ; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 ; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 ; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 ; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 ; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 ; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 ; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 +; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 @@ -16742,12 +16411,12 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 ; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 ; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 ; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff ; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 @@ -16764,51 +16433,51 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 ; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 ; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 ; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 ; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 ; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff ; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 ; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 ; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 ; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff ; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 ; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 ; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 ; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 ; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 ; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 ; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 ; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff ; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 @@ -16816,7 +16485,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 ; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo ; GFX10-NEXT: 
v_perm_b32 v3, v3, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 @@ -16840,14 +16509,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 ; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 -; GFX10-NEXT: v_mul_f32_e32 v17, v32, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18 ; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 ; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff @@ -16860,212 +16529,219 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-LABEL: v_fmul_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1 +; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_dual_mul_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 -; GFX11-NEXT: v_mul_f32_e32 v24, v64, v55 -; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; 
GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 +; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 -; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_mul_f32_e32 v2, 
v2, v18 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX11-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 +; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83 +; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24 +; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23 +; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 -; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 -; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 -; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_mul_f32_e32 v18, v84, v83 -; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 -; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11-NEXT: 
v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff -; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff +; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7 +; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20 ; GFX11-NEXT: v_mul_f32_e32 v20, v80, v71 -; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_mul_f32_e32 v25, v54, v53 +; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30 -; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_mul_f32 v26, v52, v51 -; 
GFX11-NEXT: v_dual_mul_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 +; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39 ; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33 -; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 -; GFX11-NEXT: 
v_and_or_b32 v36, v14, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff -; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 
0x400000 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff -; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff -; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 -; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26 +; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 +; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8 +; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff +; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff -; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 -; GFX11-NEXT: 
v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 +; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22 +; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff -; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21 +; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 -; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000 +; 
GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff -; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff -; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 -; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 @@ -17104,22 +16780,21 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 -; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX11-NEXT: 
v_mul_f32_e32 v17, v32, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 ; GFX11-NEXT: v_mul_f32_e32 v15, v15, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -17194,9 +16869,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -17220,9 +16894,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 @@ -17235,7 +16908,6 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -17246,7 +16918,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4 ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -17258,7 +16930,6 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 @@ -17277,7 +16948,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -17637,8 +17308,7 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: 
v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -17652,9 +17322,8 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -17665,10 +17334,9 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -17680,11 +17348,10 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 
@@ -17750,16 +17417,14 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -17772,20 +17437,18 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -17799,14 +17462,13 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -17822,16 +17484,15 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -17913,8 +17574,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; 
GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -17926,16 +17586,14 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -17951,27 +17609,24 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -17989,18 +17644,17 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -18100,17 +17754,15 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, 
vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -18121,16 +17773,14 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -18145,38 +17795,34 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -18197,31 +17843,30 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 -; 
GFX10-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 +; GFX10-NEXT: v_min_f32_e32 v3, v7, v6 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v4bf16: @@ -18229,45 +17874,42 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op @@ -18423,17 +18065,15 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; 
GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 @@ -18444,16 +18084,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 @@ -18464,16 +18102,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 @@ -18484,16 +18120,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -18512,74 +18146,66 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 -; 
GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 
v6, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -18596,62 +18222,61 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 -; GFX10-NEXT: v_bfe_u32 v10, 
v8, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 -; GFX10-NEXT: v_min_f32_e32 v9, v11, v9 -; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX10-NEXT: v_min_f32_e32 v7, v10, v9 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff ; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff +; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_min_f32_e32 v6, v10, v6 +; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_min_f32_e32 v6, v11, v6 -; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 ; GFX10-NEXT: v_min_f32_e32 v5, v15, v13 -; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo -; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000 -; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -18659,81 +18284,80 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 +; GFX10-NEXT: 
v_cndmask_b32_e32 v3, v9, v14, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 -; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_min_f32_e32 v9, v11, v9 -; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 
1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_min_f32_e32 v7, v10, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) 
| instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_f32_e32 v6, v10, v6 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo -; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff -; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10 ; GFX11-NEXT: v_min_f32_e32 v5, v15, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff +; GFX11-NEXT: 
v_or_b32_e32 v11, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -18742,9 +18366,9 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op @@ -19033,16 +18657,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7 -; GFX8-NEXT: v_or_b32_e32 v17, 
0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 @@ -19053,16 +18675,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 @@ -19073,16 +18693,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX8-NEXT: 
v_lshlrev_b32_e32 v13, 16, v12 @@ -19093,16 +18711,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 @@ -19113,16 +18729,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 @@ -19133,16 +18747,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v10, 
0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 @@ -19153,16 +18765,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 @@ -19173,16 +18783,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX8-NEXT: v_min_f32_e32 
v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -19209,146 +18817,130 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; 
GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: 
v_and_b32_e32 v18, 0xff800000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 -; GFX9-NEXT: 
v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_min_f32_e32 
v9, v17, v9 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -19369,27 +18961,26 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_min_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 -; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX10-NEXT: v_min_f32_e32 v17, v18, v17 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff ; GFX10-NEXT: v_min_f32_e32 v6, v6, v14 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff -; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 
v19, 0x400000, v7 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo @@ -19403,7 +18994,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_min_f32_e32 v5, v5, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 @@ -19413,10 +19004,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo ; GFX10-NEXT: v_min_f32_e32 v13, v19, v18 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff -; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 ; GFX10-NEXT: v_min_f32_e32 v4, v4, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo @@ -19426,14 +19017,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo -; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: 
v_min_f32_e32 v12, v18, v12 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 ; GFX10-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 @@ -19445,12 +19036,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 ; GFX10-NEXT: v_min_f32_e32 v2, v2, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -19459,8 +19050,8 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff @@ -19472,17 +19063,17 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v9 ; GFX10-NEXT: v_min_f32_e32 v9, v22, v20 -; 
GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v8 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo -; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff @@ -19511,12 +19102,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_f32_e32 v17, v18, v17 ; GFX11-NEXT: v_min_f32_e32 v6, v6, v14 @@ -19529,13 +19119,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_min_f32_e32 v7, v7, v15 ; 
GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff -; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff -; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 @@ -19557,32 +19147,32 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff -; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_min_f32_e32 v12, v18, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: 
v_add3_u32 v19, v21, v5, 0x7fff ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff -; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 @@ -19598,7 +19188,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 -; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_min_f32_e32 v3, v3, v11 @@ -19608,13 +19198,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff -; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 
v18, 0x400000, v2 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff @@ -19631,13 +19221,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_min_f32_e32 v9, v22, v20 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff -; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo -; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 @@ -20332,16 +19922,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX8-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14 -; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX8-NEXT: 
v_lshlrev_b32_e32 v30, 16, v29 @@ -20363,29 +19951,25 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_min_f32_e32 v30, v15, v30 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 @@ -20396,16 +19980,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29 ; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_or_b32_e32 
v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 @@ -20416,16 +19998,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 @@ -20436,16 +20016,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 
1 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 @@ -20456,16 +20034,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 @@ -20476,16 +20052,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: 
v_and_b32_e32 v33, 0xff800000, v8 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 @@ -20496,16 +20070,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 @@ -20516,16 +20088,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; 
GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 @@ -20536,16 +20106,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 @@ -20556,16 +20124,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 @@ -20576,16 +20142,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x 
bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 @@ -20596,16 +20160,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 @@ -20616,16 +20178,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, 
v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 @@ -20636,16 +20196,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -20686,292 +20244,260 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30 +; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_min_f32_e32 v32, v32, v29 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_min_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_min_f32_e32 v33, v33, v34 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_min_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32 -; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13 -; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; 
GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_min_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_min_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX9-NEXT: 
v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_min_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: v_min_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; 
GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 -; 
GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_min_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; 
GFX9-NEXT: v_min_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; 
GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -20996,7 +20522,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-LABEL: v_minnum_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 @@ -21061,7 +20587,6 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: s_mov_b32 s23, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 @@ -21077,10 +20602,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x 
bfloat> %b) { ; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 ; GFX10-NEXT: v_min_f32_e32 v17, v26, v50 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 +; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 ; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 @@ -21118,28 +20643,28 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 ; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 ; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 ; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 ; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 ; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 ; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 ; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 ; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 
v26, v33, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 +; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 @@ -21158,12 +20683,12 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 ; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 ; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 ; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff ; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 @@ -21180,51 +20705,51 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 ; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 ; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 
0x400000 -; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 ; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 ; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 ; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff ; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 ; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 ; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 ; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff ; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 ; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 ; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 ; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 ; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 ; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23 ; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 ; GFX10-NEXT: 
v_add3_u32 v23, v38, v23, 0x7fff ; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 @@ -21232,7 +20757,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 ; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 @@ -21256,14 +20781,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 ; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 -; GFX10-NEXT: v_min_f32_e32 v17, v32, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; GFX10-NEXT: v_min_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_min_f32_e32 v15, v15, v18 ; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 ; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff @@ -21276,212 +20801,219 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-LABEL: 
v_minnum_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1 +; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff 
; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 -; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 -; GFX11-NEXT: v_min_f32_e32 v24, v64, v55 -; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 
+; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 +; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 -; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX11-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 +; GFX11-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83 +; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24 +; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23 +; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 -; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 -; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_min_f32_e32 v18, v84, v83 -; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 -; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff -; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff +; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX11-NEXT: v_dual_min_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7 +; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX11-NEXT: v_min_f32_e32 v20, v80, v71 -; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_min_f32_e32 v25, v54, v53 +; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30 -; GFX11-NEXT: v_min_f32_e32 v28, v48, v39 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_min_f32 v26, v52, v51 -; GFX11-NEXT: v_dual_min_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_min_f32_e32 v26, v52, v51 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 +; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_dual_min_f32 v25, v54, v53 :: 
v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX11-NEXT: v_min_f32_e32 v28, v48, v39 ; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33 -; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 -; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff -; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 
v53, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff -; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff -; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 -; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26 +; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25 
+; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 +; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8 +; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff +; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff -; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 -; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 +; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22 +; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff -; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21 +; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 -; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20 ; GFX11-NEXT: 
v_cndmask_b32_e32 v26, v69, v70, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff -; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff -; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 -; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 ; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v23, v23 @@ -21520,22 +21052,21 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 -; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX11-NEXT: v_min_f32_e32 v17, v32, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 ; GFX11-NEXT: v_min_f32_e32 v15, v15, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -21594,8 +21125,7 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 
v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -21609,9 +21139,8 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -21622,10 +21151,9 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -21637,11 +21165,10 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -21707,16 +21234,14 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -21729,20 +21254,18 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -21756,14 +21279,13 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -21779,16 +21301,15 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -21870,8 +21391,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x 
bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -21883,16 +21403,14 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -21908,27 +21426,24 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; 
GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -21946,18 +21461,17 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -22057,17 +21571,15 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -22078,16 +21590,14 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -22102,38 +21612,34 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -22154,31 +21660,30 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v0 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 +; GFX10-NEXT: v_max_f32_e32 v3, v7, v6 +; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v4bf16: @@ -22186,45 +21691,42 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x 
bfloat> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 -; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-NEXT: 
v_cndmask_b32_e32 v3, v3, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff -; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op @@ -22380,17 +21882,15 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX8-NEXT: 
v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 @@ -22401,16 +21901,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 @@ -22421,16 +21919,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 
v6, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 @@ -22441,16 +21937,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -22469,74 +21963,66 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: 
v_max_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX9-NEXT: v_max_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX9-NEXT: v_max_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 -; 
GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -22553,62 +22039,61 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_max_f32_e32 v8, v9, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 -; 
GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000 -; GFX10-NEXT: v_max_f32_e32 v9, v11, v9 -; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX10-NEXT: v_max_f32_e32 v7, v10, v9 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff ; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff +; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v6, v10, v6 +; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_max_f32_e32 v6, v11, v6 -; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 ; GFX10-NEXT: v_max_f32_e32 v5, v15, v13 -; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo ; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 ; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo -; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000 -; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -22616,81 +22101,80 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, 
v3, v7, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maxnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6 -; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_max_f32_e32 v9, v11, v9 -; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000 -; 
GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_max_f32_e32 v7, v10, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_max_f32_e32 v6, v10, v6 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo -; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff -; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10 ; GFX11-NEXT: v_max_f32_e32 v5, v15, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: 
v_add3_u32 v6, v11, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff ; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -22699,9 +22183,9 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op @@ -22990,16 +22474,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_and_b32_e32 v17, 
0xff800000, v7 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 @@ -23010,16 +22492,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 @@ -23030,16 +22510,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 @@ -23050,16 +22528,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 @@ -23070,16 +22546,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 @@ -23090,16 +22564,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 
x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 @@ -23110,16 +22582,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 @@ -23130,16 +22600,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: 
v_and_b32_e32 v18, 0xff800000, v9 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -23166,146 +22634,130 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 ; GFX9-NEXT: v_max_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_bfe_u32 v17, 
v15, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 ; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 ; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 ; GFX9-NEXT: v_max_f32_e32 v13, 
v17, v13 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 ; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 ; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 ; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 
16, v2 ; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 ; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 ; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -23326,27 +22778,26 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_max_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX10-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 -; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16 ; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX10-NEXT: v_max_f32_e32 v17, v18, v17 ; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff ; GFX10-NEXT: v_max_f32_e32 v6, v6, v14 ; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff -; 
GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo @@ -23360,7 +22811,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_max_f32_e32 v5, v5, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo ; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 ; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 @@ -23370,10 +22821,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo ; GFX10-NEXT: v_max_f32_e32 v13, v19, v18 ; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff -; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5 ; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo @@ -23383,14 +22834,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff ; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo -; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, 
v13 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_max_f32_e32 v12, v18, v12 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo ; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 @@ -23402,12 +22853,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18 ; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -23416,8 +22867,8 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo ; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff @@ -23429,17 +22880,17 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 ; GFX10-NEXT: 
v_max_f32_e32 v1, v1, v9 ; GFX10-NEXT: v_max_f32_e32 v9, v22, v20 -; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo -; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 ; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff @@ -23468,12 +22919,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_max_f32_e32 v17, v18, v17 ; GFX11-NEXT: v_max_f32_e32 v6, v6, v14 @@ -23486,13 +22936,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, 
<16 x bfloat> %b) { ; GFX11-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 ; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff -; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 ; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff -; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo ; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 @@ -23514,32 +22964,32 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18 ; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff -; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-NEXT: v_max_f32_e32 v12, v18, v12 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff ; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo ; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff -; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo ; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 @@ -23555,7 +23005,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 -; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_max_f32_e32 v3, v3, v11 @@ -23565,13 +23015,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff -; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo ; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; 
GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo ; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff @@ -23588,13 +23038,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_max_f32_e32 v9, v22, v20 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff -; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9 ; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo -; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 @@ -24289,16 +23739,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31 ; GFX8-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 ; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 -; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14 -; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; 
GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 @@ -24320,29 +23768,25 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_max_f32_e32 v30, v15, v30 ; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 ; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 @@ -24353,16 +23797,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29 ; 
GFX8-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 @@ -24373,16 +23815,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 ; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 @@ -24393,16 +23833,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v27, v27 ; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 ; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 @@ -24413,16 +23851,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 ; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 @@ -24433,16 +23869,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, 
v8 ; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 @@ -24453,16 +23887,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 ; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 @@ -24473,16 +23905,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: 
v_or_b32_e32 v33, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 @@ -24493,16 +23923,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 @@ -24513,16 +23941,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 @@ 
-24533,16 +23959,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 @@ -24553,16 +23977,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 @@ -24573,16 +23995,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 @@ -24593,16 +24013,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 -; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 -; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -24643,292 +24061,260 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: 
v_and_b32_e32 v33, 0xff800000, v31 ; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc ; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 ; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30 +; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_max_f32_e32 v32, 
v32, v29 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_max_f32_e32 v32, v32, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_max_f32_e32 v33, v33, v34 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX9-NEXT: v_max_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32 -; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13 -; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX9-NEXT: 
v_cndmask_b32_e32 v29, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: v_max_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_max_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_bfe_u32 v33, 
v27, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 ; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: v_max_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 ; GFX9-NEXT: 
v_max_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 ; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 ; GFX9-NEXT: v_max_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 ; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_max_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; GFX9-NEXT: v_max_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc ; GFX9-NEXT: 
v_lshlrev_b32_e32 v21, 16, v20 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_max_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 ; GFX9-NEXT: v_max_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 ; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: 
v_cndmask_b32_e32 v3, v19, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_max_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 ; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: v_max_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 ; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 ; GFX9-NEXT: v_max_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -24953,7 +24339,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-LABEL: v_maxnum_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 ; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 ; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 @@ -25018,7 +24404,6 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1 ; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1 ; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1 -; GFX10-NEXT: s_mov_b32 s23, 0xff800000 ; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 ; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 ; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 @@ 
-25034,10 +24419,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 ; GFX10-NEXT: v_max_f32_e32 v17, v26, v50 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11 +; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49 +; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39 ; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11 @@ -25075,28 +24460,28 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1 ; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1 ; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1 -; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37 ; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12 ; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18 ; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff -; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1 ; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17 ; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff -; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17 ; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0 ; GFX10-NEXT: v_add3_u32 v68, v68, v0, 
0x7fff -; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33 +; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14 +; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35 +; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 @@ -25115,12 +24500,12 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11 -; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51 ; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1 ; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1 -; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24 ; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff ; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1 @@ -25137,51 +24522,51 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19 ; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff -; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19 ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2 ; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; 
GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 -; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25 ; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 ; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff -; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25 ; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff ; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7 ; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff -; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6 ; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6 ; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff ; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21 ; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff -; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21 ; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4 ; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20 ; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff -; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20 ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3 ; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9 -; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8 ; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8 ; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff -; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000 +; GFX10-NEXT: 
v_or_b32_e32 v53, 0x400000, v23 ; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23 ; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff ; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff -; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6 @@ -25189,7 +24574,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22 ; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff -; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 @@ -25213,14 +24598,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302 ; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 -; GFX10-NEXT: v_max_f32_e32 v17, v32, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; GFX10-NEXT: v_max_f32_e32 v17, v31, v17 ; GFX10-NEXT: v_max_f32_e32 v15, v15, v18 ; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000 -; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15 ; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff @@ -25233,212 +24618,219 @@ define <32 x 
bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-LABEL: v_maxnum_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 +; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 ; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1 +; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 +; GFX11-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26 -; GFX11-NEXT: v_max_f32_e32 v24, v64, v55 -; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 +; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 ; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
+; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 +; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 -; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX11-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 +; GFX11-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83 +; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 +; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24 +; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23 +; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 ; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 -; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10 -; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v26, 
0xffff0000, v26 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX11-NEXT: v_max_f32_e32 v18, v84, v83 -; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 -; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 -; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 -; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff -; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff +; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; GFX11-NEXT: v_dual_max_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7 +; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX11-NEXT: v_max_f32_e32 v20, v80, v71 -; GFX11-NEXT: 
v_dual_max_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11-NEXT: v_max_f32_e32 v25, v54, v53 +; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9 ; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13 -; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30 -; GFX11-NEXT: v_max_f32_e32 v28, v48, v39 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14 -; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14 -; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_max_f32 v26, v52, v51 -; GFX11-NEXT: v_dual_max_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_max_f32_e32 v26, v52, v51 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 +; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 +; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 +; 
GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_max_f32_e32 v29, v38, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX11-NEXT: v_max_f32_e32 v28, v48, v39 ; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33 -; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 ; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 -; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 -; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 ; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff -; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff ; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff -; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13 +; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 +; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29 ; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 ; GFX11-NEXT: v_add3_u32 v49, 
v49, v29, 0x7fff -; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 ; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff -; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 -; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11 ; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 ; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff -; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000 ; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27 ; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 ; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff -; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10 ; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 ; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff -; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 -; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26 +; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 -; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff -; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000 +; 
GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25 +; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 +; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff -; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 -; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8 +; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff +; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6 ; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff -; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 -; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 +; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22 +; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 -; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff -; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21 +; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 -; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff -; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 +; GFX11-NEXT: 
v_add3_u32 v115, v115, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20 ; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff -; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff +; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18 ; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 -; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000 +; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000 +; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff -; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 -; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2 ; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff +; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff -; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 ; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 -; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 ; GFX11-NEXT: 
v_perm_b32 v14, v14, v16, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 @@ -25477,22 +24869,21 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 -; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 -; GFX11-NEXT: v_max_f32_e32 v17, v32, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 ; GFX11-NEXT: v_max_f32_e32 v15, v15, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 ; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff ; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -25586,8 +24977,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX8-NEXT: 
v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -25617,9 +25007,8 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -25641,13 +25030,12 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4 ; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -25674,9 +25062,8 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 ; GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo @@ -25684,7 +25071,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -25724,8 +25111,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -25738,9 +25124,8 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -25750,10 +25135,9 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_ldexp_f32 v0, 
v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -25764,11 +25148,10 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -25816,8 +25199,7 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -25831,9 +25213,8 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -25844,11 +25225,10 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v3, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo @@ -25947,8 +25327,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -25978,9 +25357,8 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -25990,7 +25368,6 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -26004,7 +25381,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; 
GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -26015,7 +25392,6 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo @@ -26035,7 +25411,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -26097,8 +25473,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26119,9 +25494,8 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26131,7 +25505,6 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo @@ -26139,7 +25512,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -26150,7 +25523,6 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo @@ -26161,7 +25533,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -26257,8 +25629,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; 
GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26288,9 +25659,8 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26300,7 +25670,6 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -26314,7 +25683,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -26325,7 +25694,6 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo @@ -26345,7 +25713,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11-NEXT: 
v_sub_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -26442,8 +25810,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26473,9 +25840,8 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26485,7 +25851,6 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1 @@ -26500,7 +25865,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: 
v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -26511,7 +25876,6 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1 @@ -26532,7 +25896,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -26594,8 +25958,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26616,9 +25979,8 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 
16, v0 @@ -26628,7 +25990,6 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo @@ -26636,7 +25997,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -26647,7 +26008,6 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo @@ -26658,7 +26018,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -26752,8 +26112,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26783,9 +26142,8 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26795,7 +26153,6 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 ; GFX10-NEXT: v_rndne_f32_e32 v2, v1 @@ -26810,7 +26167,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -26821,7 +26178,6 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX11-NEXT: v_rndne_f32_e32 v2, v1 @@ -26842,7 +26198,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -26882,8 +26238,7 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26896,9 +26251,8 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX9-NEXT: v_ceil_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26908,10 +26262,9 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_ceil_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -26922,11 +26275,10 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: 
s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ceil_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -26967,8 +26319,7 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26981,9 +26332,8 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX9-NEXT: v_trunc_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -26993,10 +26343,9 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -27007,11 +26356,10 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX11: ; 
%bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -27052,8 +26400,7 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27066,9 +26413,8 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27078,10 +26424,9 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -27092,11 +26437,10 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -27137,8 +26481,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27151,9 +26494,8 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27163,10 +26505,9 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: 
v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -27177,11 +26518,10 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -27240,8 +26580,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27260,9 +26599,8 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27276,11 +26614,10 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; 
GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -27297,13 +26634,12 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -27343,8 +26679,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27357,9 +26692,8 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 
v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27369,10 +26703,9 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_rndne_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -27383,11 +26716,10 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rndne_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -27428,8 +26760,7 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27442,9 +26773,8 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX9-NEXT: v_floor_f32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 
0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27454,10 +26784,9 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_floor_f32_e32 v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -27468,11 +26797,10 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_floor_f32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -27505,8 +26833,7 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27519,9 +26846,8 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; 
GFX9-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -27531,10 +26857,9 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -27545,11 +26870,10 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -31058,9 +30382,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; 
GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -31072,9 +30395,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -31084,9 +30406,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -31097,11 +30418,10 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -31143,16 +30463,14 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> 
%x) { ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -31166,15 +30484,13 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -31186,12 +30502,11 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo @@ -31205,16 +30520,15 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -31263,26 +30577,22 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 
v3, vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -31294,25 +30604,22 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v4 -; GFX9-NEXT: v_add3_u32 v2, v2, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -31325,24 +30632,23 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; 
GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i16> %x to <3 x bfloat> @@ -31393,31 +30699,27 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: 
v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -31430,32 +30732,28 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v5 -; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 
0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -31468,30 +30766,29 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v11, v0, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff -; 
GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -31499,37 +30796,39 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX11-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v11, v0, s0, 0x400000 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i16> %x to <4 x bfloat> @@ -31557,9 +30856,8 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -31571,9 +30869,8 @@ 
define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -31583,9 +30880,8 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -31596,10 +30892,9 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -31636,16 +30931,14 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; 
GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -31659,15 +30952,13 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -31679,12 +30970,11 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo @@ -31698,13 +30988,12 @@ define <2 x bfloat> 
@v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -31749,23 +31038,20 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, 
v2 @@ -31782,21 +31068,18 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -31809,24 +31092,23 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 
0x400000, v2 +; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = sitofp <3 x i32> %x to <3 x bfloat> @@ -31869,31 +31151,27 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; 
GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -31910,28 +31188,24 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -31944,30 +31218,29 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: 
v_and_or_b32 v4, v2, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_and_or_b32 v9, v0, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_bfe_u32 v10, v1, 16, 1 -; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_and_or_b32 v11, v1, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v10, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v5, v7, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -31976,32 +31249,32 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; 
GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_and_or_b32 v9, v0, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v11, v1, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v4, v10, v1, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v7, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = sitofp <4 x i32> %x to <4 x bfloat> @@ -32063,8 +31336,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -32087,9 +31359,8 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -32100,7 +31371,6 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2 @@ -32112,7 +31382,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -32124,7 +31394,6 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX11-NEXT: v_cls_i32_e32 v3, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 @@ -32141,7 +31410,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -32240,22 +31509,20 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 -; GFX8-NEXT: v_min_u32_e32 v7, v0, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 +; GFX8-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v7 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 
0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -32285,21 +31552,19 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 +; GFX9-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -32313,7 +31578,6 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3 ; GFX10-NEXT: v_ffbh_i32_e32 v6, v1 ; GFX10-NEXT: v_ffbh_i32_e32 v7, v3 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 @@ -32336,9 +31600,9 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 
0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo @@ -32354,9 +31618,10 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: v_xor_b32_e32 v5, v2, v3 ; GFX11-NEXT: v_cls_i32_e32 v6, v1 ; GFX11-NEXT: v_cls_i32_e32 v7, v3 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v4 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_nc_u32_e32 v6, -1, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v7, -1, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -32385,9 +31650,9 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -32515,23 +31780,22 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_min_u32_e32 v7, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_xor_b32_e32 v6, v2, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_ffbh_i32_e32 v5, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6 +; GFX8-NEXT: v_min_u32_e32 v5, v5, v6 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, v2, v3 -; GFX8-NEXT: v_ffbh_i32_e32 v6, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7 -; GFX8-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] ; GFX8-NEXT: v_ldexp_f32 v0, v0, v4 ; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -32539,17 +31803,15 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5 ; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 @@ -32565,30 +31827,29 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 -; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX9-NEXT: v_ffbh_i32_e32 v6, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc ; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 ; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 @@ -32597,21 +31858,19 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 -; 
GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v5 +; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -32638,7 +31897,6 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_min_u32_e32 v8, v10, v8 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v6, v6, v7 ; GFX10-NEXT: v_min_u32_e32 v7, v11, v9 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] @@ -32660,13 +31918,13 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 ; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v2, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX10-NEXT: 
v_add3_u32 v4, v4, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo @@ -32820,10 +32078,9 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 -; GFX8-NEXT: v_min_u32_e32 v11, v4, v5 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX8-NEXT: v_min_u32_e32 v10, v4, v5 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 @@ -32836,7 +32093,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 ; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 @@ -32844,8 +32101,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 @@ -32858,22 +32114,20 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 -; GFX8-NEXT: v_min_u32_e32 v9, v0, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 +; GFX8-NEXT: v_min_u32_e32 v8, v0, 
v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v9 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -32905,34 +32159,32 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 ; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v11, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v10 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_min_u32_e32 v10, v4, v5 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v9, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v8, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9 -; GFX9-NEXT: v_add_u32_e32 v8, -1, v8 -; GFX9-NEXT: v_add_u32_e32 v9, 
32, v9 -; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc ; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 ; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 ; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 @@ -32941,21 +32193,19 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v9, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 +; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: 
v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -32989,7 +32239,6 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] ; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 ; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] ; GFX10-NEXT: v_min_u32_e32 v9, v12, v9 ; GFX10-NEXT: v_min_u32_e32 v11, v13, v14 @@ -33015,21 +32264,21 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX10-NEXT: v_ldexp_f32 v3, v3, v4 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v5 -; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 ; GFX10-NEXT: v_add3_u32 v4, v7, v2, 0x7fff ; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX10-NEXT: v_add3_u32 v4, v6, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v7, v8, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo @@ -33065,16 +32314,15 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_u32_e32 v9, v12, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_min_u32_e32 v11, v13, v14 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-NEXT: v_min_u32_e32 v5, 1, v6 ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v8 @@ -33096,21 +32344,21 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v5 -; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v6 ; GFX11-NEXT: v_add3_u32 v4, v7, v2, 0x7fff ; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v9, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-NEXT: v_add3_u32 v4, v6, v3, 0x7fff ; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v6, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff -; GFX11-NEXT: v_and_or_b32 v8, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; 
GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -33148,9 +32396,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -33162,9 +32409,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -33174,9 +32420,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -33187,11 +32432,10 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -33233,16 +32477,14 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -33256,15 +32498,13 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 
0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -33276,12 +32516,11 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo @@ -33295,16 +32534,15 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ 
-33357,22 +32595,19 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -33384,25 +32619,22 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: 
v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v4 -; GFX9-NEXT: v_add3_u32 v2, v2, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -33415,17 +32647,16 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 
0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -33484,30 +32715,26 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -33520,32 +32747,28 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa 
v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v5 -; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -33558,23 +32781,22 @@ define <4 x bfloat> 
@v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1 ; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v11, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v7, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -33589,17 +32811,16 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: 
v_bfe_u32 v4, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v7, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -33610,11 +32831,11 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-NEXT: v_and_or_b32 v9, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_and_or_b32 v11, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -33652,9 +32873,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -33666,9 +32886,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: 
v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -33678,9 +32897,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -33691,10 +32909,9 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -33731,16 +32948,14 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, 
v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -33754,15 +32969,13 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -33774,12 +32987,11 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo @@ -33793,13 +33005,12 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -33844,23 +33055,20 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -33877,21 +33085,18 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX9-NEXT: 
v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -33904,24 +33109,23 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; 
GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] %op = uitofp <3 x i32> %x to <3 x bfloat> @@ -33964,31 +33168,27 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -34005,28 +33205,24 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) 
{ ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -34039,30 +33235,29 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: 
v_and_or_b32 v9, v0, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_bfe_u32 v10, v1, 16, 1 -; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_and_or_b32 v11, v1, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v10, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v5, v7, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -34071,32 +33266,32 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_and_or_b32 v9, v0, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v11, v1, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000 -; GFX11-NEXT: v_add3_u32 v4, v10, v1, 0x7fff -; GFX11-NEXT: v_add3_u32 v5, v7, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = 
uitofp <4 x i32> %x to <4 x bfloat> @@ -34146,8 +33341,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -34166,9 +33360,8 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -34178,7 +33371,6 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 @@ -34187,7 +33379,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -34198,7 +33390,6 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] @@ -34211,7 +33402,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -34284,22 +33475,20 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX8-NEXT: v_min_u32_e32 v7, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 +; GFX8-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v7 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -34321,21 +33510,19 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX9-NEXT: v_bfe_u32 v0, v4, 
16, 1 ; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 +; GFX9-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -34347,7 +33534,6 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v4, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v5, v3 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX10-NEXT: v_min_u32_e32 v5, 32, v5 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -34364,9 +33550,9 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo @@ -34380,7 
+33566,6 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1 ; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_u32_e32 v4, 32, v4 ; GFX11-NEXT: v_min_u32_e32 v5, 32, v5 @@ -34404,9 +33589,9 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) @@ -34503,16 +33688,15 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_ffbh_u32_e32 v6, v3 -; GFX8-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX8-NEXT: v_ffbh_u32_e32 v5, v3 +; GFX8-NEXT: v_min_u32_e32 v5, 32, v5 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v4 @@ -34522,17 +33706,15 @@ 
define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5 ; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -34545,44 +33727,41 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX9-NEXT: v_ffbh_u32_e32 v6, v5 ; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX9-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, 
v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc ; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 ; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 ; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v5 +; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -34596,7 +33775,6 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10-NEXT: v_ffbh_u32_e32 v6, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v8, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v7, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v6, 32, v6 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX10-NEXT: v_min_u32_e32 v7, 32, v7 @@ -34620,13 +33798,13 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX10-NEXT: v_ldexp_f32 v1, v1, v7 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 
0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 @@ -34739,19 +33917,18 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4 ; GFX8-NEXT: v_ffbh_u32_e32 v4, v7 -; GFX8-NEXT: v_min_u32_e32 v11, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] -; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8 +; GFX8-NEXT: v_min_u32_e32 v10, 32, v4 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_ffbh_u32_e32 v8, v1 ; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v10 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 @@ -34759,8 +33936,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v4 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; GFX8-NEXT: 
v_sub_u32_e32 v1, vcc, 32, v8 @@ -34769,22 +33945,20 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0 ; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX8-NEXT: v_min_u32_e32 v9, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 +; GFX8-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v9 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 @@ -34808,49 +33982,45 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 ; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 ; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 -; GFX9-NEXT: v_min_u32_e32 v11, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7] -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_ffbh_u32_e32 v8, v1 +; GFX9-NEXT: v_min_u32_e32 v10, 32, v4 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 ; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v11 +; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc ; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 ; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 ; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v9, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 +; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9 +; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 ; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: 
s_mov_b32 s4, 0x7060302 @@ -34865,7 +34035,6 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX10-NEXT: v_ffbh_u32_e32 v10, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v11, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v9, v7 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX10-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX10-NEXT: v_min_u32_e32 v11, 32, v11 @@ -34894,27 +34063,27 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v6 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_ldexp_f32 v4, v4, v9 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v3, v9, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000 +; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v4, s4, 0x400000 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 
0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -34925,9 +34094,10 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 ; GFX11-NEXT: v_clz_i32_u32_e32 v11, v3 ; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v8, 32, v8 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -34948,41 +34118,42 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v4 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v11 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v4 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v6 -; GFX11-NEXT: v_ldexp_f32 v2, v2, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ldexp_f32 v2, v2, v8 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v5 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_ldexp_f32 v4, v4, v9 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: 
v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 -; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v3, v9, v1, 0x7fff -; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000 -; GFX11-NEXT: v_and_or_b32 v9, v4, s0, 0x400000 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = uitofp <4 x i64> %x to <4 x bfloat> @@ -40088,8 +39259,7 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -40104,9 +39274,8 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -40118,10 +39287,9 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 ; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v1, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -40134,11 +39302,10 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1 ; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v0, v0, v2, 0x7fff @@ -40206,16 +39373,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -40234,16 +39399,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -40259,14 +39422,13 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4 ; GFX10-NEXT: v_fmac_f32_e32 
v2, v0, v1 ; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 -; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff ; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo @@ -40284,15 +39446,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4 ; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -40375,8 +39536,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 @@ -40390,16 +39550,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> 
%a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -40416,9 +39574,8 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 @@ -40429,16 +39586,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -40460,16 +39615,15 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1 -; GFX10-NEXT: v_and_or_b32 v3, v6, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1 ; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000 -; GFX10-NEXT: v_and_or_b32 v8, v5, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 ; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff ; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -40572,17 +39726,15 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 @@ -40595,16 +39747,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -40625,16 +39775,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 @@ -40645,16 +39793,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -40681,22 +39827,21 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6 ; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff -; GFX10-NEXT: v_and_or_b32 v1, v6, s4, 0x400000 ; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1 ; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1 -; GFX10-NEXT: v_and_or_b32 v9, v5, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo ; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff ; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff -; GFX10-NEXT: v_and_or_b32 v3, v7, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v4, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo @@ -40717,14 
+39862,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3 ; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-NEXT: v_and_or_b32 v1, v6, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -40736,14 +39880,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v8 ; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 ; GFX11-NEXT: v_add3_u32 v0, v2, v5, 0x7fff -; GFX11-NEXT: v_and_or_b32 v9, v5, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_add3_u32 v6, v8, v4, 0x7fff -; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-NEXT: v_add3_u32 v2, v3, v7, 0x7fff -; GFX11-NEXT: v_and_or_b32 v3, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 @@ -40803,8 +39947,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; 
GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -40813,8 +39956,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -40828,18 +39970,16 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -40850,10 +39990,9 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; 
GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v3, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo @@ -40861,7 +40000,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo @@ -40873,11 +40012,10 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff @@ -40887,7 +40025,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -40958,8 +40096,7 
@@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -40971,16 +40108,14 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -40989,8 +40124,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -41005,36 +40139,32 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX9-NEXT: 
v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -41048,14 +40178,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1 -; GFX10-NEXT: v_and_or_b32 v5, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_and_or_b32 v6, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff @@ -41066,13 +40195,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -41086,19 +40215,17 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 
0xffff0000, v0 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1 -; GFX11-NEXT: v_and_or_b32 v5, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v6, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -41109,7 +40236,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 @@ -41117,7 +40244,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo @@ -41210,8 +40337,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -41221,8 +40347,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -41231,8 +40356,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -41243,16 +40367,14 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 
0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -41261,8 +40383,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -41278,54 +40399,48 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -41343,41 +40458,40 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 ; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000 -; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; 
GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX10-NEXT: v_and_or_b32 v10, v0, s4, 0x400000 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo ; 
GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff @@ -41492,8 +40606,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 @@ -41505,16 +40618,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -41523,8 +40634,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 @@ -41533,8 +40643,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x 
bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -41545,16 +40654,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -41563,8 +40670,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -41581,72 +40687,64 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX9-NEXT: v_bfe_u32 v7, v6, 
16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, 
v7 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 @@ -41667,45 +40765,44 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: s_mov_b32 s4, 0xff800000 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX10-NEXT: v_and_or_b32 v3, v6, s4, 0x400000 +; 
GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_and_or_b32 v10, v7, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX10-NEXT: v_and_or_b32 v12, v0, s4, 0x400000 ; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_and_or_b32 v5, v3, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000 +; GFX10-NEXT: 
v_or_b32_e32 v9, 0x400000, v1 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff ; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 @@ -41713,10 +40810,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff ; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff -; GFX10-NEXT: v_and_or_b32 v6, v2, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo @@ -41736,7 +40833,6 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 @@ -41744,20 +40840,20 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-NEXT: v_mul_f32_e32 v7, v9, v7 -; GFX11-NEXT: v_and_or_b32 v3, v6, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff -; GFX11-NEXT: v_and_or_b32 v6, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 ; GFX11-NEXT: 
v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2 ; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_and_or_b32 v10, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; GFX11-NEXT: v_add3_u32 v9, v9, v7, 0x7fff ; GFX11-NEXT: v_bfe_u32 v11, v0, 16, 1 ; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add3_u32 v11, v11, v0, 0x7fff ; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4 @@ -41769,7 +40865,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -41781,7 +40877,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_and_or_b32 v5, v3, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-NEXT: v_add3_u32 v4, v7, v3, 0x7fff ; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 @@ -41789,10 +40885,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff ; GFX11-NEXT: v_add3_u32 v5, v7, v2, 0x7fff -; 
GFX11-NEXT: v_and_or_b32 v6, v2, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff -; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll index d35871e3774de..99b163dc9753b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -790,8 +790,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -806,9 +805,8 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index 9142858806f1c..5889de7faf3e5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1524,9 +1524,8 @@ 
define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1566,9 +1565,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1608,9 +1606,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 @@ -1632,7 +1629,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b32 s5, 0xff800000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s2, -4 ; GFX10-NEXT: s_mov_b32 s1, s3 @@ -1650,7 +1646,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo @@ -1673,7 +1669,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s5, 0xff800000 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s0, s2, -4 @@ -1694,7 +1689,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo @@ -1744,9 +1739,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX900-NEXT: 
v_bfe_u32 v3, v1, 16, 1 -; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1786,9 +1780,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1828,9 +1821,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1 +; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1854,7 +1846,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: 
s_mov_b32 s5, 0xff800000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s2, -4 ; GFX10-NEXT: s_mov_b32 s1, s3 @@ -1872,7 +1863,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo @@ -1895,7 +1886,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX11-LABEL: global_atomic_fadd_ret_bf16_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s5, 0xff800000 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s0, s2, -4 @@ -1916,7 +1906,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll index 6a7fb7142c293..ba946fe00a8c5 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll @@ -912,10 +912,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 ; DAGISEL-GFX11-WF32-NEXT: 
[[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608 - ; DAGISEL-GFX11-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) @@ -934,10 +933,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608 - ; 
DAGISEL-GFX11-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) @@ -956,10 +954,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 ; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608 - ; DAGISEL-GFX10-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec ; 
DAGISEL-GFX10-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec - ; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) @@ -978,10 +975,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 ; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608 - ; DAGISEL-GFX10-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec - ; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed 
[[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 48ae98f125bf4..5e76dfd9bdddb 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1413,9 +1413,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 ; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 -; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: v_and_b32_e32 v5, v4, v2 @@ -1451,9 +1450,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1560,9 +1558,8 @@ 
define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: v_add_f32_e32 v4, 4.0, v4 ; VI-NEXT: v_bfe_u32 v6, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v4 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; VI-NEXT: v_and_b32_e32 v5, v3, v2 @@ -1597,9 +1594,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 672c93b6adf7f..66c49ba8b734d 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -4259,65 +4259,57 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7 ; GFX9-NEXT: v_fma_f32 v1, v8, v5, v1 -; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11 ; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2 ; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v7 +; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 ; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v1 ; GFX9-NEXT: v_add3_u32 v5, v5, v7, 
s2 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v1 ; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xff800000, v8 ; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v8 ; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 ; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1 ; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2 ; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v3 ; GFX9-NEXT: v_add3_u32 
v5, v5, v1, s2 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v2 ; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v4 ; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v4 ; GFX9-NEXT: v_add3_u32 v11, v11, v4, s2 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc @@ -4332,7 +4324,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xff800000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] @@ -4355,20 +4346,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9 ; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4 ; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1 -; GFX10-NEXT: v_and_or_b32 v8, v7, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7 ; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX10-NEXT: v_and_or_b32 v12, v0, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff ; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1 ; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff 
; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX10-NEXT: v_and_or_b32 v16, v1, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: v_and_or_b32 v14, v11, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11 ; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -4382,7 +4373,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo -; GFX10-NEXT: v_and_or_b32 v8, v4, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -4390,14 +4381,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, 0x400000 -; GFX10-NEXT: v_and_or_b32 v10, v1, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff ; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_and_or_b32 v12, v7, s2, 0x400000 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo @@ -4416,7 +4407,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture 
readonl ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0xff800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] @@ -4438,11 +4428,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX11-NEXT: v_and_or_b32 v14, v11, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 ; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 -; GFX11-NEXT: v_and_or_b32 v16, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff @@ -4450,11 +4440,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 ; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 -; GFX11-NEXT: v_and_or_b32 v8, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 ; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff -; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 ; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo @@ -4466,7 +4456,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: 
v_dual_cndmask_b32 v1, v15, v16 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4480,14 +4470,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-NEXT: v_and_or_b32 v12, v7, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10 -; GFX11-NEXT: v_and_or_b32 v10, v1, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff From 828bf134d732a29146d1dd666548c75b49012b08 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Wed, 21 Feb 2024 14:50:21 -0800 Subject: [PATCH 157/351] [InstallAPI] Cleanup HeaderFile Interface & options handling, NFC (#82544) --- clang/include/clang/InstallAPI/HeaderFile.h | 3 +++ clang/tools/clang-installapi/Options.cpp | 14 ++++---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/InstallAPI/HeaderFile.h b/clang/include/clang/InstallAPI/HeaderFile.h index 6ccd944f8b01b..fc64a43b3def5 100644 --- a/clang/include/clang/InstallAPI/HeaderFile.h +++ 
b/clang/include/clang/InstallAPI/HeaderFile.h @@ -21,6 +21,8 @@ namespace clang::installapi { enum class HeaderType { + /// Unset or unknown type. + Unknown, /// Represents declarations accessible to all clients. Public, /// Represents declarations accessible to a disclosed set of clients. @@ -41,6 +43,7 @@ class HeaderFile { std::optional Language; public: + HeaderFile() = delete; HeaderFile(StringRef FullPath, HeaderType Type, StringRef IncludeName = StringRef(), std::optional Language = std::nullopt) diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp index 08d1c0e8e660f..562a643edfcf4 100644 --- a/clang/tools/clang-installapi/Options.cpp +++ b/clang/tools/clang-installapi/Options.cpp @@ -22,14 +22,7 @@ namespace installapi { bool Options::processDriverOptions(InputArgList &Args) { // Handle inputs. - llvm::vfs::Status Stat; - for (const auto &Path : Args.getAllArgValues(OPT_INPUT)) { - if (FM->getNoncachedStatValue(Path, Stat) || !Stat.exists()) { - Diags->Report(clang::diag::err_drv_no_such_file) << Path; - return false; - } - DriverOpts.FileLists.push_back(std::move(Path)); - } + llvm::append_range(DriverOpts.FileLists, Args.getAllArgValues(OPT_INPUT)); // Handle output. SmallString OutputPath; @@ -61,8 +54,9 @@ bool Options::processDriverOptions(InputArgList &Args) { // Capture target triples first. 
if (ArgTarget) { - for (auto *Arg : Args.filtered(OPT_target)) { - llvm::Triple TargetTriple(Arg->getValue()); + for (const Arg *A : Args.filtered(OPT_target)) { + A->claim(); + llvm::Triple TargetTriple(A->getValue()); Target TAPITarget = Target(TargetTriple); if ((TAPITarget.Arch == AK_unknown) || (TAPITarget.Platform == PLATFORM_UNKNOWN)) { From 049e142badfca3fae5c190c5d4b37acdd2e9c10c Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 21 Feb 2024 17:04:31 -0600 Subject: [PATCH 158/351] [libc] Fix startup utilities failing to install in full build mode (#82522) Summary: Currently, doing `ninja install` will fail in fullbuild mode due to the startup utilities not being built by default. This was hidden previously by the fact that if tests were run, it would build the startup utilities and thus they would be present. This patch solves this issue by making the `libc-startup` target a dependncy on the final library. Furthermore we simply factor out the library install directory into the base CMake directory next to the include directory handling. This change makes the `crt` files get installed in `lib/x86_64-unknown-linu-gnu` instead of just `lib`. This fixes an error I had where doing a runtimes failed to install its libraries because the install step always errored. 
--- libc/CMakeLists.txt | 9 +++++++++ libc/lib/CMakeLists.txt | 12 +++--------- libc/startup/linux/CMakeLists.txt | 3 ++- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 3d77573661674..616beae13d9aa 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -225,6 +225,15 @@ else() set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}) endif() +if(LIBC_TARGET_TRIPLE) + set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}/${LIBC_TARGET_TRIPLE}) +elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT LIBC_GPU_BUILD) + set(LIBC_INSTALL_LIBRARY_DIR + lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}) +else() + set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}) +endif() + if(LIBC_TARGET_ARCHITECTURE_IS_GPU) include(prepare_libc_gpu_build) set(LIBC_ENABLE_UNITTESTS OFF) diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt index af7ef2de93dd4..c1a804232c1f5 100644 --- a/libc/lib/CMakeLists.txt +++ b/libc/lib/CMakeLists.txt @@ -35,19 +35,13 @@ foreach(archive IN ZIP_LISTS ) if(LLVM_LIBC_FULL_BUILD) target_link_libraries(${archive_1} PUBLIC libc-headers) + if(TARGET libc-startup) + add_dependencies(${archive_1} libc-startup) + endif() endif() list(APPEND added_archive_targets ${archive_1}) endforeach() -if(LIBC_TARGET_TRIPLE) - set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}/${LIBC_TARGET_TRIPLE}) -elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT LIBC_GPU_BUILD) - set(LIBC_INSTALL_LIBRARY_DIR - lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}) -else() - set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}) -endif() - install( TARGETS ${added_archive_targets} ARCHIVE DESTINATION ${LIBC_INSTALL_LIBRARY_DIR} diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt index 39bcca9cdba9f..a287bc4d633d4 100644 --- a/libc/startup/linux/CMakeLists.txt +++ b/libc/startup/linux/CMakeLists.txt @@ -131,7 +131,8 @@ foreach(target IN LISTS startup_components) 
set(fq_target_name libc.startup.linux.${target}) add_dependencies(libc-startup ${fq_target_name}) install(FILES $ - DESTINATION ${CMAKE_INSTALL_LIBDIR} + DESTINATION ${LIBC_INSTALL_LIBRARY_DIR} RENAME $ + EXCLUDE_FROM_ALL COMPONENT libc) endforeach() From 300425cea51ef566a4d38e57afd9a7ae8024a682 Mon Sep 17 00:00:00 2001 From: Zixu Wang <9819235+zixu-w@users.noreply.github.com> Date: Wed, 21 Feb 2024 15:12:50 -0800 Subject: [PATCH 159/351] =?UTF-8?q?Revert=20"[Docs]=20Add=20release=20note?= =?UTF-8?q?=20about=20Clang-defined=20target=20OS=20macros=20=E2=80=A6=20(?= =?UTF-8?q?#80045)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …(#79879)" This reverts commit b40d5b1b08564d23d5e0769892ebbc32447b2987. The target OS macros work is included in the 18.x release. Move the release note to the release branch (https://github.com/llvm/llvm-project/pull/80044). --- clang/docs/ReleaseNotes.rst | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ef2d9b8e46ae4..bac166e6c3562 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -62,21 +62,6 @@ Clang Frontend Potentially Breaking Changes of ``-Wno-gnu-binary-literal`` will no longer silence this pedantic warning, which may break existing uses with ``-Werror``. -Target OS macros extension -^^^^^^^^^^^^^^^^^^^^^^^^^^ -A new Clang extension (see :ref:`here `) is enabled for -Darwin (Apple platform) targets. Clang now defines ``TARGET_OS_*`` macros for -these targets, which could break existing code bases with improper checks for -the ``TARGET_OS_`` macros. For example, existing checks might fail to include -the ``TargetConditionals.h`` header from Apple SDKs and therefore leaving the -macros undefined and guarded code unexercised. - -Affected code should be checked to see if it's still intended for the specific -target and fixed accordingly. 
- -The extension can be turned off by the option ``-fno-define-target-os-macros`` -as a workaround. - What's New in Clang |release|? ============================== Some of the major new features and improvements to Clang are listed @@ -161,17 +146,6 @@ Non-comprehensive list of changes in this release New Compiler Flags ------------------ -.. _target_os_detail: - -Target OS macros extension -^^^^^^^^^^^^^^^^^^^^^^^^^^ -A pair of new flags ``-fdefine-target-os-macros`` and -``-fno-define-target-os-macros`` has been added to Clang to enable/disable the -extension to provide built-in definitions of a list of ``TARGET_OS_*`` macros -based on the target triple. - -The extension is enabled by default for Darwin (Apple platform) targets. - Deprecated Compiler Flags ------------------------- From 699c408c88b3ed02f25464aa868bd48454fbba3f Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 16 Feb 2024 11:13:24 -0800 Subject: [PATCH 160/351] [NFC][HWASAN] Fix misleading name --- .../Transforms/Instrumentation/HWAddressSanitizer.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 393afc9152055..33add6d4cd767 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -348,7 +348,7 @@ class HWAddressSanitizer { void instrumentGlobals(); Value *getPC(IRBuilder<> &IRB); - Value *getSP(IRBuilder<> &IRB); + Value *getFP(IRBuilder<> &IRB); Value *getFrameRecordInfo(IRBuilder<> &IRB); void instrumentPersonalityFunctions(); @@ -1148,7 +1148,7 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) { // Extract some entropy from the stack pointer for the tags. // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ // between functions). 
- Value *StackPointerLong = getSP(IRB); + Value *StackPointerLong = getFP(IRB); Value *StackTag = applyTagMask(IRB, IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20))); @@ -1165,7 +1165,7 @@ Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag, } Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB) { - Value *StackPointerLong = getSP(IRB); + Value *StackPointerLong = getFP(IRB); Value *UARTag = applyTagMask(IRB, IRB.CreateLShr(StackPointerLong, PointerTagShift)); @@ -1232,7 +1232,7 @@ Value *HWAddressSanitizer::getPC(IRBuilder<> &IRB) { return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(), IntptrTy); } -Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) { +Value *HWAddressSanitizer::getFP(IRBuilder<> &IRB) { if (!CachedSP) { // FIXME: use addressofreturnaddress (but implement it in aarch64 backend // first). @@ -1251,7 +1251,7 @@ Value *HWAddressSanitizer::getSP(IRBuilder<> &IRB) { Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) { // Prepare ring buffer data. Value *PC = getPC(IRB); - Value *SP = getSP(IRB); + Value *SP = getFP(IRB); // Mix SP and PC. // Assumptions: From 9ea9e93f4a74b363887b773397bcb134062270d9 Mon Sep 17 00:00:00 2001 From: Yuta Mukai Date: Thu, 22 Feb 2024 09:17:10 +0900 Subject: [PATCH 161/351] [MachinePipeliner] Fix elements being added while the list is iterated (#80805) There is no need to add the elements of Objs twice, so the addition is removed. 
--- llvm/lib/CodeGen/MachinePipeliner.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 697e0da094422..1bda19b2e6e92 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -768,7 +768,6 @@ static void getUnderlyingObjects(const MachineInstr *MI, Objs.clear(); return; } - Objs.push_back(V); } } From 640e781dc87bdb74e14a66c89e54417e60150904 Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com> Date: Wed, 21 Feb 2024 16:18:18 -0800 Subject: [PATCH 162/351] [BOLT][DWARF][NFC] Use SkeletonCU in place of IsDWO check (#82540) Changed isDWO to a function that checks Skeleton CU that is passed in. This is for preparation for https://github.com/llvm/llvm-project/pull/81062. --- bolt/include/bolt/Core/DIEBuilder.h | 8 ++++++-- bolt/lib/Core/DIEBuilder.cpp | 20 ++++++++++---------- bolt/lib/Rewrite/DWARFRewriter.cpp | 2 +- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h index f13d42ff4ab42..f0db924e2ccbb 100644 --- a/bolt/include/bolt/Core/DIEBuilder.h +++ b/bolt/include/bolt/Core/DIEBuilder.h @@ -124,7 +124,7 @@ class DIEBuilder { std::vector> Abbreviations; BinaryContext &BC; DWARFContext *DwarfContext{nullptr}; - bool IsDWO{false}; + DWARFUnit *SkeletonCU{nullptr}; uint64_t UnitSize{0}; llvm::DenseSet AllProcessed; @@ -264,8 +264,12 @@ class DIEBuilder { /// current Section. DIE *constructDIEFast(DWARFDie &DDie, DWARFUnit &U, uint32_t UnitId); + /// Returns true if this DIEBUilder is for DWO Unit. + bool isDWO() const { return SkeletonCU != nullptr; } + public: - DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext, bool IsDWO = false); + DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext, + DWARFUnit *SkeletonCU = nullptr); /// Returns enum to what we are currently processing. 
ProcessingType getCurrentProcessingState() { return getState().Type; } diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index 3c72c745086b5..e6104b81bf6c9 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -179,8 +179,8 @@ void DIEBuilder::constructFromUnit(DWARFUnit &DU) { } DIEBuilder::DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext, - bool IsDWO) - : BC(BC), DwarfContext(DwarfContext), IsDWO(IsDWO) {} + DWARFUnit *SkeletonCU) + : BC(BC), DwarfContext(DwarfContext), SkeletonCU(SkeletonCU) {} static unsigned int getCUNum(DWARFContext *DwarfContext, bool IsDWO) { unsigned int CUNum = IsDWO ? DwarfContext->getNumDWOCompileUnits() @@ -204,11 +204,11 @@ void DIEBuilder::buildTypeUnits(DebugStrOffsetsWriter *StrOffsetWriter, true); } } - const unsigned int CUNum = getCUNum(DwarfContext, IsDWO); + const unsigned int CUNum = getCUNum(DwarfContext, isDWO()); getState().CloneUnitCtxMap.resize(CUNum); DWARFContext::unit_iterator_range CU4TURanges = - IsDWO ? DwarfContext->dwo_types_section_units() - : DwarfContext->types_section_units(); + isDWO() ? DwarfContext->dwo_types_section_units() + : DwarfContext->types_section_units(); getState().Type = ProcessingType::DWARF4TUs; for (std::unique_ptr &DU : CU4TURanges) @@ -218,8 +218,8 @@ void DIEBuilder::buildTypeUnits(DebugStrOffsetsWriter *StrOffsetWriter, constructFromUnit(*DU.get()); DWARFContext::unit_iterator_range CURanges = - IsDWO ? DwarfContext->dwo_info_section_units() - : DwarfContext->info_section_units(); + isDWO() ? DwarfContext->dwo_info_section_units() + : DwarfContext->info_section_units(); // This handles DWARF4 CUs and DWARF5 CU/TUs. 
// Creating a vector so that for reference handling only DWARF5 CU/TUs are @@ -242,11 +242,11 @@ void DIEBuilder::buildCompileUnits(const bool Init) { if (Init) BuilderState.reset(new State()); - unsigned int CUNum = getCUNum(DwarfContext, IsDWO); + unsigned int CUNum = getCUNum(DwarfContext, isDWO()); getState().CloneUnitCtxMap.resize(CUNum); DWARFContext::unit_iterator_range CURanges = - IsDWO ? DwarfContext->dwo_info_section_units() - : DwarfContext->info_section_units(); + isDWO() ? DwarfContext->dwo_info_section_units() + : DwarfContext->info_section_units(); // This handles DWARF4 CUs and DWARF5 CU/TUs. // Creating a vector so that for reference handling only DWARF5 CU/TUs are diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index a77f401c64c8c..849c363730ebb 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -709,7 +709,7 @@ void DWARFRewriter::updateDebugInfo() { : LegacyRangesSectionWriter.get(); // Skipping CUs that failed to load. if (SplitCU) { - DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), true); + DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), Unit); DWODIEBuilder.buildDWOUnit(**SplitCU); std::string DWOName = updateDWONameCompDir( *Unit, *DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit)); From 004c1972b4585fe8051814ceb6c6cdbf3cb62290 Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com> Date: Wed, 21 Feb 2024 16:48:02 -0800 Subject: [PATCH 163/351] [BOLT][DWARF][NFC] Expose DebugStrOffsetsWriter::clear (#82548) Refactored cod that clears data-structures in DebugStrOffsetsWriter into clear() function and made initialize() public. This is for https://github.com/llvm/llvm-project/pull/81062. 
--- bolt/include/bolt/Core/DebugData.h | 8 +++++++- bolt/lib/Core/DebugData.cpp | 3 +-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h index 31a636ba2ce65..48b813a4ca11f 100644 --- a/bolt/include/bolt/Core/DebugData.h +++ b/bolt/include/bolt/Core/DebugData.h @@ -450,10 +450,16 @@ class DebugStrOffsetsWriter { return std::move(StrOffsetsBuffer); } -private: /// Initializes Buffer and Stream. void initialize(DWARFUnit &Unit); + /// Clear data. + void clear() { + IndexToAddressMap.clear(); + StrOffsets.clear(); + } + +private: std::unique_ptr StrOffsetsBuffer; std::unique_ptr StrOffsetsStream; std::map IndexToAddressMap; diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp index 2942f0b9190fa..a75016ede3090 100644 --- a/bolt/lib/Core/DebugData.cpp +++ b/bolt/lib/Core/DebugData.cpp @@ -909,8 +909,7 @@ void DebugStrOffsetsWriter::finalizeSection(DWARFUnit &Unit, } StrOffsetSectionWasModified = false; - IndexToAddressMap.clear(); - StrOffsets.clear(); + clear(); } void DebugStrWriter::create() { From f204aee1b9173ed9ae72017808f0a379c3a8de7a Mon Sep 17 00:00:00 2001 From: Fabian Mora Date: Wed, 21 Feb 2024 20:47:19 -0500 Subject: [PATCH 164/351] [mlir][GPU] Remove the SerializeToCubin pass (#82486) The `SerializeToCubin` pass was deprecated in September 2023 in favor of GPU compilation attributes; see the [GPU compilation](https://mlir.llvm.org/docs/Dialects/GPU/#gpu-compilation) section in the `gpu` dialect MLIR docs. This patch removes `SerializeToCubin` from the repo. 
--- mlir/CMakeLists.txt | 1 - .../mlir/Dialect/GPU/Transforms/Passes.h | 14 -- mlir/lib/Dialect/GPU/CMakeLists.txt | 52 ----- .../GPU/Transforms/SerializeToCubin.cpp | 180 ------------------ 4 files changed, 247 deletions(-) delete mode 100644 mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 2d9f78e03ba76..16c898bdeb6e0 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -123,7 +123,6 @@ else() endif() add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ENABLE_ROCM_CONVERSIONS}) -set(MLIR_ENABLE_DEPRECATED_GPU_SERIALIZATION 0 CACHE BOOL "Enable deprecated GPU serialization passes") set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the mlir CUDA runner") set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the mlir ROCm runner") set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the mlir Sycl runner") diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index 5885facd07541..8f7466a697d85 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -147,25 +147,11 @@ class SerializeToBlobPass : public OperationPass { // Registration //===----------------------------------------------------------------------===// -/// Register pass to serialize GPU kernel functions to a CUBIN binary -/// annotation. -LLVM_DEPRECATED("use Target attributes instead", "") -void registerGpuSerializeToCubinPass(); - /// Register pass to serialize GPU kernel functions to a HSAco binary /// annotation. LLVM_DEPRECATED("use Target attributes instead", "") void registerGpuSerializeToHsacoPass(); -/// Create an instance of the GPU kernel function to CUBIN binary serialization -/// pass with optLevel (default level 2). 
-LLVM_DEPRECATED("use Target attributes instead", "") -std::unique_ptr createGpuSerializeToCubinPass(StringRef triple, - StringRef chip, - StringRef features, - int optLevel = 2, - bool dumpPtx = false); - /// Create an instance of the GPU kernel function to HSAco binary serialization /// pass. LLVM_DEPRECATED("use Target attributes instead", "") diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index e5776e157b612..51cfa2216e0c1 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -1,11 +1,3 @@ -if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) - set(NVPTX_LIBS - NVPTXCodeGen - NVPTXDesc - NVPTXInfo - ) -endif() - if (MLIR_ENABLE_ROCM_CONVERSIONS) set(AMDGPU_LIBS IRReader @@ -60,7 +52,6 @@ add_mlir_dialect_library(MLIRGPUTransforms Transforms/ParallelLoopMapper.cpp Transforms/ROCDLAttachTarget.cpp Transforms/SerializeToBlob.cpp - Transforms/SerializeToCubin.cpp Transforms/SerializeToHsaco.cpp Transforms/ShuffleRewriter.cpp Transforms/SPIRVAttachTarget.cpp @@ -74,7 +65,6 @@ add_mlir_dialect_library(MLIRGPUTransforms Core MC Target - ${NVPTX_LIBS} ${AMDGPU_LIBS} DEPENDS @@ -110,48 +100,6 @@ add_mlir_dialect_library(MLIRGPUTransforms add_subdirectory(TransformOps) add_subdirectory(Pipelines) -if(MLIR_ENABLE_CUDA_RUNNER) - if(NOT MLIR_ENABLE_CUDA_CONVERSIONS) - message(SEND_ERROR - "Building mlir with cuda support requires the NVPTX backend") - endif() - - # Configure CUDA language support. Using check_language first allows us to - # give a custom error message. - include(CheckLanguage) - check_language(CUDA) - if (CMAKE_CUDA_COMPILER) - enable_language(CUDA) - else() - message(SEND_ERROR - "Building mlir with cuda support requires a working CUDA install") - endif() - - # Enable gpu-to-cubin pass. - target_compile_definitions(obj.MLIRGPUTransforms - PRIVATE - MLIR_GPU_TO_CUBIN_PASS_ENABLE=1 - ) - - # Add CUDA headers includes and the libcuda.so library. 
- target_include_directories(obj.MLIRGPUTransforms - PRIVATE - ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} - ) - - # Add link path for the cuda driver library. - find_library(CUDA_DRIVER_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) - get_filename_component(CUDA_DRIVER_LIBRARY_PATH "${CUDA_DRIVER_LIBRARY}" DIRECTORY) - target_link_directories(MLIRGPUTransforms PRIVATE ${CUDA_DRIVER_LIBRARY_PATH}) - - target_link_libraries(MLIRGPUTransforms - PRIVATE - MLIRNVVMToLLVMIRTranslation - cuda - ) - -endif() - if(MLIR_ENABLE_ROCM_CONVERSIONS) if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)) message(SEND_ERROR diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp deleted file mode 100644 index 34ad4e6868e15..0000000000000 --- a/mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp +++ /dev/null @@ -1,180 +0,0 @@ -//===- LowerGPUToCUBIN.cpp - Convert GPU kernel to CUBIN blob -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a pass that serializes a gpu module into CUBIN blob and -// adds that blob as a string attribute of the module. 
-// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/GPU/Transforms/Passes.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" -#include "llvm/Support/Debug.h" - -#if MLIR_GPU_TO_CUBIN_PASS_ENABLE -#include "mlir/Pass/Pass.h" -#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Export.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/Threading.h" - -#include - -using namespace mlir; - -static void emitCudaError(const llvm::Twine &expr, const char *buffer, - CUresult result, Location loc) { - const char *error = nullptr; - cuGetErrorString(result, &error); - emitError(loc, - expr.concat(error ? " failed with error code " + llvm::Twine{error} - : llvm::Twine(" failed with unknown error ")) - .concat("[") - .concat(buffer) - .concat("]")); -} - -#define RETURN_ON_CUDA_ERROR(expr) \ - do { \ - if (auto status = (expr)) { \ - emitCudaError(#expr, jitErrorBuffer, status, loc); \ - return {}; \ - } \ - } while (false) - -namespace { -class SerializeToCubinPass - : public PassWrapper { - static llvm::once_flag initializeBackendOnce; - -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToCubinPass) - - SerializeToCubinPass(StringRef triple = "nvptx64-nvidia-cuda", - StringRef chip = "sm_35", StringRef features = "+ptx60", - int optLevel = 2, bool dumpPtx = false); - - StringRef getArgument() const override { return "gpu-to-cubin"; } - StringRef getDescription() const override { - return "Lower GPU kernel function to CUBIN binary annotations"; - } - -private: - // Serializes PTX to CUBIN. - std::unique_ptr> - serializeISA(const std::string &isa) override; -}; -} // namespace - -// Sets the 'option' to 'value' unless it already has a value. 
-static void maybeSetOption(Pass::Option &option, StringRef value) { - if (!option.hasValue()) - option = value.str(); -} - -llvm::once_flag SerializeToCubinPass::initializeBackendOnce; - -SerializeToCubinPass::SerializeToCubinPass(StringRef triple, StringRef chip, - StringRef features, int optLevel, - bool dumpPtx) { - // No matter how this pass is constructed, ensure that the NVPTX backend - // is initialized exactly once. - llvm::call_once(initializeBackendOnce, []() { - // Initialize LLVM NVPTX backend. -#if LLVM_HAS_NVPTX_TARGET - LLVMInitializeNVPTXTarget(); - LLVMInitializeNVPTXTargetInfo(); - LLVMInitializeNVPTXTargetMC(); - LLVMInitializeNVPTXAsmPrinter(); -#endif - }); - - maybeSetOption(this->triple, triple); - maybeSetOption(this->chip, chip); - maybeSetOption(this->features, features); - this->dumpPtx = dumpPtx; - if (this->optLevel.getNumOccurrences() == 0) - this->optLevel.setValue(optLevel); -} - -std::unique_ptr> -SerializeToCubinPass::serializeISA(const std::string &isa) { - Location loc = getOperation().getLoc(); - char jitErrorBuffer[4096] = {0}; - - RETURN_ON_CUDA_ERROR(cuInit(0)); - - // Linking requires a device context. - CUdevice device; - RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0)); - CUcontext context; - // Use the primary context. - RETURN_ON_CUDA_ERROR(cuDevicePrimaryCtxRetain(&context, device)); - // Push the primary context so that the next CUDA operations - // actually use it. 
- RETURN_ON_CUDA_ERROR(cuCtxPushCurrent(context)); - CUlinkState linkState; - - CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}; - void *jitOptionsVals[] = {jitErrorBuffer, - reinterpret_cast(sizeof(jitErrorBuffer))}; - - RETURN_ON_CUDA_ERROR(cuLinkCreate(2, /* number of jit options */ - jitOptions, /* jit options */ - jitOptionsVals, /* jit option values */ - &linkState)); - - auto kernelName = getOperation().getName().str(); - if (dumpPtx) { - llvm::dbgs() << " Kernel Name : [" << kernelName << "]\n"; - llvm::dbgs() << isa << "\n"; - } - RETURN_ON_CUDA_ERROR(cuLinkAddData( - linkState, CUjitInputType::CU_JIT_INPUT_PTX, - const_cast(static_cast(isa.c_str())), isa.length(), - kernelName.c_str(), 0, /* number of jit options */ - nullptr, /* jit options */ - nullptr /* jit option values */ - )); - - void *cubinData; - size_t cubinSize; - RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize)); - - char *cubinAsChar = static_cast(cubinData); - auto result = - std::make_unique>(cubinAsChar, cubinAsChar + cubinSize); - - // This will also destroy the cubin data. - RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState)); - // Pop and release the primary context. - CUcontext poppedContext; - RETURN_ON_CUDA_ERROR(cuCtxPopCurrent(&poppedContext)); - RETURN_ON_CUDA_ERROR(cuDevicePrimaryCtxRelease(device)); - - return result; -} - -// Register pass to serialize GPU kernel functions to a CUBIN binary annotation. 
-void mlir::registerGpuSerializeToCubinPass() { - PassRegistration registerSerializeToCubin( - [] { return std::make_unique(); }); -} - -std::unique_ptr mlir::createGpuSerializeToCubinPass(StringRef triple, - StringRef arch, - StringRef features, - int optLevel, - bool dumpPtx) { - return std::make_unique(triple, arch, features, - optLevel, dumpPtx); -} - -#else // MLIR_GPU_TO_CUBIN_PASS_ENABLE -void mlir::registerGpuSerializeToCubinPass() {} -#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE From 4c0fdcdb33076e936327cb0743c827f019a8e1ff Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Wed, 21 Feb 2024 19:50:29 -0600 Subject: [PATCH 165/351] [Hexagon] Generate absolute-set load/store instructions. (#82034) The optimization finds the loads/stores of a specific form and translate the first load/store to an absolute-set form there by optimizing out the transfer and eliminate the constant extenders. --- llvm/lib/Target/Hexagon/CMakeLists.txt | 1 + .../Target/Hexagon/HexagonGenMemAbsolute.cpp | 274 ++++++++++++++++++ .../Target/Hexagon/HexagonTargetMachine.cpp | 9 + .../CodeGen/Hexagon/load-const-extend-opt.ll | 68 +++++ .../CodeGen/Hexagon/store-const-extend-opt.ll | 72 +++++ 5 files changed, 424 insertions(+) create mode 100644 llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp create mode 100644 llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll create mode 100644 llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 76f99b4d3ec58..753f3dcc88e19 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_target(HexagonCodeGen HexagonFrameLowering.cpp HexagonGenExtract.cpp HexagonGenInsert.cpp + HexagonGenMemAbsolute.cpp HexagonGenMux.cpp HexagonGenPredicate.cpp HexagonHardwareLoops.cpp diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp new file mode 
100644 index 0000000000000..afd49631943f2 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp @@ -0,0 +1,274 @@ +//===--- HexagonGenMemAbsolute.cpp - Generate Load/Store Set Absolute ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This pass traverses through all the basic blocks in a function and converts +// an indexed load/store with offset "0" to a absolute-set load/store +// instruction as long as the use of the register in the new instruction +// dominates the rest of the uses and there are more than 2 uses. + +#include "HexagonTargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "hexagon-abs" + +using namespace llvm; + +STATISTIC(HexagonNumLoadAbsConversions, + "Number of Load instructions converted to absolute-set form"); +STATISTIC(HexagonNumStoreAbsConversions, + "Number of Store instructions converted to absolute-set form"); + +namespace llvm { +FunctionPass *createHexagonGenMemAbsolute(); +void initializeHexagonGenMemAbsolutePass(PassRegistry &Registry); +} // namespace llvm + +namespace { + +class HexagonGenMemAbsolute : public MachineFunctionPass { + const HexagonInstrInfo *TII; + MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI; + +public: + static char ID; + HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) { + 
initializeHexagonGenMemAbsolutePass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon Generate Load/Store Set Absolute Address Instruction"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addPreserved(); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + static bool isValidIndexedLoad(int &Opcode, int &NewOpcode); + static bool isValidIndexedStore(int &Opcode, int &NewOpcode); +}; +} // namespace + +char HexagonGenMemAbsolute::ID = 0; + +INITIALIZE_PASS(HexagonGenMemAbsolute, "hexagon-gen-load-absolute", + "Hexagon Generate Load/Store Set Absolute Address Instruction", + false, false) + +bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(Fn.getFunction())) + return false; + + TII = Fn.getSubtarget().getInstrInfo(); + MRI = &Fn.getRegInfo(); + TRI = Fn.getRegInfo().getTargetRegisterInfo(); + + MachineDominatorTree &MDT = getAnalysis(); + + // Loop over all of the basic blocks + for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); + MBBb != MBBe; ++MBBb) { + MachineBasicBlock *MBB = &*MBBb; + // Traverse the basic block + for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); + ++MII) { + MachineInstr *MI = &*MII; + int Opc = MI->getOpcode(); + if (Opc != Hexagon::CONST32 && Opc != Hexagon::A2_tfrsi) + continue; + + const MachineOperand &MO = MI->getOperand(0); + if (!MO.isReg() || !MO.isDef()) + continue; + + unsigned DstReg = MO.getReg(); + if (MRI->use_nodbg_empty(DstReg)) + continue; + + typedef MachineRegisterInfo::use_nodbg_iterator use_iterator; + use_iterator NextUseMI = MRI->use_nodbg_begin(DstReg); + + MachineInstr *NextMI = NextUseMI->getParent(); + int NextOpc = NextMI->getOpcode(); + int NewOpc; + bool IsLoad = isValidIndexedLoad(NextOpc, NewOpc); + + if (!IsLoad && !isValidIndexedStore(NextOpc, NewOpc)) + continue; + 
+ // Base and Offset positions for load and store instructions + // Load R(dest), R(base), Imm -> R(dest) = mem(R(base) + Imm) + // Store R(base), Imm, R (src) -> mem(R(base) + Imm) = R(src) + unsigned BaseRegPos, ImmPos, RegPos; + if (!TII->getBaseAndOffsetPosition(*NextMI, BaseRegPos, ImmPos)) + continue; + RegPos = IsLoad ? 0 : 2; + + bool IsGlobal = MI->getOperand(1).isGlobal(); + if (!MI->getOperand(1).isImm() && !IsGlobal) + continue; + + const MachineOperand *BaseOp = nullptr; + int64_t Offset; + bool Scalable; + TII->getMemOperandWithOffset(*NextMI, BaseOp, Offset, Scalable, TRI); + + // Ensure BaseOp is non-null and register type. + if (!BaseOp || !BaseOp->isReg()) + continue; + + if (Scalable) + continue; + + unsigned BaseReg = BaseOp->getReg(); + if ((DstReg != BaseReg) || (Offset != 0)) + continue; + + const MachineOperand &MO0 = NextMI->getOperand(RegPos); + + if (!MO0.isReg()) + continue; + + unsigned LoadStoreReg = MO0.getReg(); + + // Store: Bail out if the src and base are same (def and use on same + // register). + if (LoadStoreReg == BaseReg) + continue; + + // Insert the absolute-set instruction "I" only if the use of the + // BaseReg in "I" dominates the rest of the uses of BaseReg and if + // there are more than 2 uses of this BaseReg. + bool Dominates = true; + unsigned Counter = 0; + for (use_iterator I = NextUseMI, E = MRI->use_nodbg_end(); I != E; ++I) { + Counter++; + if (!MDT.dominates(NextMI, I->getParent())) + Dominates = false; + } + + if ((!Dominates) || (Counter < 3)) + continue; + + // If we reach here, we have met all the conditions required for the + // replacement of the absolute instruction. + LLVM_DEBUG({ + dbgs() << "Found a pair of instructions for absolute-set " + << (IsLoad ? 
"load" : "store") << "\n"; + dbgs() << *MI; + dbgs() << *NextMI; + }); + MachineBasicBlock *ParentBlock = NextMI->getParent(); + MachineInstrBuilder MIB; + if (IsLoad) { // Insert absolute-set load instruction + ++HexagonNumLoadAbsConversions; + MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(), + TII->get(NewOpc), LoadStoreReg) + .addReg(DstReg, RegState::Define); + } else { // Insert absolute-set store instruction + ++HexagonNumStoreAbsConversions; + MIB = BuildMI(*ParentBlock, NextMI, NextMI->getDebugLoc(), + TII->get(NewOpc), DstReg); + } + + MachineOperand ImmOperand = MI->getOperand(1); + if (IsGlobal) + MIB.addGlobalAddress(ImmOperand.getGlobal(), ImmOperand.getOffset(), + ImmOperand.getTargetFlags()); + else + MIB.addImm(ImmOperand.getImm()); + + if (IsLoad) + MIB->getOperand(0).setSubReg(MO0.getSubReg()); + else + MIB.addReg(LoadStoreReg, 0, MO0.getSubReg()); + + LLVM_DEBUG(dbgs() << "Replaced with " << *MIB << "\n"); + // Erase the instructions that got replaced. + MII = MBB->erase(MI); + --MII; + NextMI->getParent()->erase(NextMI); + } + } + + return true; +} + +bool HexagonGenMemAbsolute::isValidIndexedLoad(int &Opc, int &NewOpc) { + + bool Result = true; + switch (Opc) { + case Hexagon::L2_loadrb_io: + NewOpc = Hexagon::L4_loadrb_ap; + break; + case Hexagon::L2_loadrh_io: + NewOpc = Hexagon::L4_loadrh_ap; + break; + case Hexagon::L2_loadri_io: + NewOpc = Hexagon::L4_loadri_ap; + break; + case Hexagon::L2_loadrd_io: + NewOpc = Hexagon::L4_loadrd_ap; + break; + case Hexagon::L2_loadruh_io: + NewOpc = Hexagon::L4_loadruh_ap; + break; + case Hexagon::L2_loadrub_io: + NewOpc = Hexagon::L4_loadrub_ap; + break; + default: + Result = false; + } + + return Result; +} + +bool HexagonGenMemAbsolute::isValidIndexedStore(int &Opc, int &NewOpc) { + + bool Result = true; + switch (Opc) { + case Hexagon::S2_storerd_io: + NewOpc = Hexagon::S4_storerd_ap; + break; + case Hexagon::S2_storeri_io: + NewOpc = Hexagon::S4_storeri_ap; + break; + case 
Hexagon::S2_storerh_io: + NewOpc = Hexagon::S4_storerh_ap; + break; + case Hexagon::S2_storerb_io: + NewOpc = Hexagon::S4_storerb_ap; + break; + default: + Result = false; + } + + return Result; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonGenMemAbsolute() { + return new HexagonGenMemAbsolute(); +} diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 7d4b420071c4a..49ef547d65fb2 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -92,6 +92,10 @@ static cl::opt static cl::opt DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden, cl::desc("Disable splitting double registers")); +static cl::opt + EnableGenMemAbs("hexagon-mem-abs", cl::init(true), cl::Hidden, + cl::desc("Generate absolute set instructions")); + static cl::opt EnableBitSimplify("hexagon-bit", cl::init(true), cl::Hidden, cl::desc("Bit simplification")); @@ -151,6 +155,7 @@ namespace llvm { void initializeHexagonCopyToCombinePass(PassRegistry&); void initializeHexagonEarlyIfConversionPass(PassRegistry&); void initializeHexagonExpandCondsetsPass(PassRegistry&); + void initializeHexagonGenMemAbsolutePass(PassRegistry &); void initializeHexagonGenMuxPass(PassRegistry&); void initializeHexagonHardwareLoopsPass(PassRegistry&); void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &); @@ -177,6 +182,7 @@ namespace llvm { FunctionPass *createHexagonFixupHwLoops(); FunctionPass *createHexagonGenExtract(); FunctionPass *createHexagonGenInsert(); + FunctionPass *createHexagonGenMemAbsolute(); FunctionPass *createHexagonGenMux(); FunctionPass *createHexagonGenPredicate(); FunctionPass *createHexagonHardwareLoops(); @@ -211,6 +217,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void 
LLVMInitializeHexagonTarget() { initializeHexagonConstPropagationPass(PR); initializeHexagonCopyToCombinePass(PR); initializeHexagonEarlyIfConversionPass(PR); + initializeHexagonGenMemAbsolutePass(PR); initializeHexagonGenMuxPass(PR); initializeHexagonHardwareLoopsPass(PR); initializeHexagonLoopIdiomRecognizeLegacyPassPass(PR); @@ -413,6 +420,8 @@ void HexagonPassConfig::addPreRegAlloc() { insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID); if (!DisableStoreWidening) addPass(createHexagonStoreWidening()); + if (EnableGenMemAbs) + addPass(createHexagonGenMemAbsolute()); if (!DisableHardwareLoops) addPass(createHexagonHardwareLoops()); } diff --git a/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll b/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll new file mode 100644 index 0000000000000..6f9e83c23ab32 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll @@ -0,0 +1,68 @@ +; RUN: llc -march=hexagon -O3 -hexagon-small-data-threshold=0 < %s | FileCheck %s +; This test checks the case if there are more than 2 uses of a constan address, move the +; value in to a register and replace all instances of constant with the register. +; The GenMemAbsolute pass generates a absolute-set instruction if there are more +; than 2 uses of this register. 
+ +; CHECK: loadi32_3 +; CHECK-NOT: r{{[0-9]+}} = memw(##441652) +; CHECK-NOT: r{{[0-9]+}} = memw(r{{[0-9]+}}+#0) +; CHECK:r{{[0-9]+}} = memw(r[[REG:[0-9]+]]=##441652) +; CHECK-NOT: r{{[0-9]+}} = {emw(##441652) +; CHECK:r{{[0-9]+}} = memw(r[[REG]]+#0) +; CHECK-NOT: r{{[0-9]+}} = memw(##441652) +; CHECK:r{{[0-9]+}} = memw(r[[REG]]+#0) +; CHECK-NOT: r{{[0-9]+}} = memw(##441652) + +define void @loadi32_3() #0 { +entry: + %0 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4 + %1 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4 + %2 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4 + ret void +} + +; CHECK: loadi32_2 +; CHECK-NOT: r{{[0-9]+}} = ##441652 +; CHECK: r{{[0-9]+}} = memw(##441652) +; CHECK: r{{[0-9]+}} = memw(##441652) + +define void @loadi32_2() #0 { +entry: + %0 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4 + %1 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4 + ret void +} + +; CHECK: loadi32_abs_global_3 +; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt) +; CHECK-NOT: r{{[0-9]+}} = memw(r{{[0-9]+}}+#0) +; CHECK:r{{[0-9]+}} = memw(r[[REG:[0-9]+]]=##globalInt) +; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt) +; CHECK:r{{[0-9]+}} = memw(r[[REG]]+#0) +; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt) +; CHECK:r{{[0-9]+}} = memw(r[[REG]]+#0) +; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt) + +@globalInt = external global i32, align 8 +define void @loadi32_abs_global_3() #0 { +entry: + %0 = load volatile i32, ptr @globalInt, align 4 + %1 = load volatile i32, ptr @globalInt, align 4 + %2 = load volatile i32, ptr @globalInt, align 4 + ret void +} + +; CHECK: loadi32_abs_global_2 +; CHECK-NOT:r[[REG:[0-9]+]] = ##globalInt +; CHECK:r{{[0-9]+}} = memw(##globalInt) +; CHECK:r{{[0-9]+}} = memw(##globalInt) + +define void @loadi32_abs_global_2() #0 { +entry: + %0 = load volatile i32, ptr @globalInt, align 4 + %1 = load volatile i32, ptr @globalInt, align 4 + ret void +} + +attributes #0 = { nounwind } diff --git 
a/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll b/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll new file mode 100644 index 0000000000000..dccf176f3bd07 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll @@ -0,0 +1,72 @@ +; RUN: llc -march=hexagon -O3 -hexagon-small-data-threshold=0 < %s | FileCheck %s +; This test checks the case if there are more than 2 uses of a constan address, move the +; value in to a register and replace all instances of constant with the register. +; The GenMemAbsolute pass generates a absolute-set instruction if there are more +; than 2 uses of this register. + +; CHECK: storetrunci32_3 +; CHECK-NOT: memw(##441652) = r{{[0-9]+}} +; CHECK-NOT: memw(r{{[0-9]+}}+#0) = r{{[0-9]+}} +; CHECK:memw(r[[REG:[0-9]+]]=##441652) = r{{[0-9]+}} +; CHECK-NOT: memw(##441652) = r{{[0-9]+}} +; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}} +; CHECK-NOT: memw(##441652) = r{{[0-9]+}} +; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}} +; CHECK-NOT: memw(##441652) = r{{[0-9]+}} + +define void @storetrunci32_3(i64 %descr_addr, i32 %rpm_or_sys, i32 %kkr) #0 { +entry: + %conv = trunc i64 %descr_addr to i32 + store volatile i32 %conv, ptr inttoptr (i32 441652 to ptr), align 4 + store volatile i32 %rpm_or_sys, ptr inttoptr (i32 441652 to ptr), align 4 + store volatile i32 %kkr, ptr inttoptr (i32 441652 to ptr), align 4 + ret void +} + +; CHECK: storetrunci32_2 +; CHECK-NOT: r{{[0-9]+}} = ##441652 +; CHECK: memw(##441652) = r{{[0-9]+}} +; CHECK: memw(##441652) = r{{[0-9]+}} + +define void @storetrunci32_2(i64 %descr_addr, i32 %rpm_or_sys) #0 { +entry: + %conv = trunc i64 %descr_addr to i32 + store volatile i32 %conv, ptr inttoptr (i32 441652 to ptr), align 4 + store volatile i32 %rpm_or_sys, ptr inttoptr (i32 441652 to ptr), align 4 + ret void +} + +; CHECK: storetrunci32_abs_global_3 +; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}} +; CHECK-NOT: memw(r{{[0-9]+}}+#0) = r{{[0-9]+}} +; CHECK:memw(r[[REG:[0-9]+]]=##globalInt) = r{{[0-9]+}} +; CHECK-NOT: 
memw(##globalInt) = r{{[0-9]+}} +; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}} +; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}} +; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}} +; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}} + +@globalInt = external global i32, align 8 +define void @storetrunci32_abs_global_3(i64 %descr_addr, i32 %rpm_or_sys, i32 %kkr) #0 { +entry: + %conv = trunc i64 %descr_addr to i32 + store volatile i32 %conv, ptr @globalInt, align 4 + store volatile i32 %rpm_or_sys, ptr @globalInt, align 4 + store volatile i32 %kkr, ptr @globalInt, align 4 + ret void +} + +; CHECK: storetrunci32_abs_global_2 +; CHECK-NOT:r[[REG:[0-9]+]] = ##globalInt +; CHECK:memw(##globalInt) = r{{[0-9]+}} +; CHECK:memw(##globalInt) = r{{[0-9]+}} + +define void @storetrunci32_abs_global_2(i64 %descr_addr, i32 %rpm_or_sys) #0 { +entry: + %conv = trunc i64 %descr_addr to i32 + store volatile i32 %conv, ptr @globalInt, align 4 + store volatile i32 %rpm_or_sys, ptr @globalInt, align 4 + ret void +} + +attributes #0 = { nounwind } From d62ca8def395ac165f253fdde1d93725394a4d53 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Wed, 21 Feb 2024 19:50:47 -0600 Subject: [PATCH 166/351] [Hexagon] Optimize post-increment load and stores in loops. (#82418) This patch optimizes the post-increment instructions so that we can packetize them together. 
v1 = phi(v0, v3') v2,v3 = post_load v1, 4 v2',v3'= post_load v3, 4 This can be optimized in two ways v1 = phi(v0, v3') v2,v3' = post_load v1, 8 v2' = load v1, 4 --- llvm/lib/Target/Hexagon/CMakeLists.txt | 1 + llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 56 ++ llvm/lib/Target/Hexagon/HexagonInstrInfo.h | 2 + llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp | 689 ++++++++++++++++++ .../Target/Hexagon/HexagonTargetMachine.cpp | 13 + .../Hexagon/MCTargetDesc/HexagonBaseInfo.h | 12 +- llvm/test/CodeGen/Hexagon/post-inc-vec.mir | 413 +++++++++++ llvm/test/CodeGen/Hexagon/post_inc_store.mir | 168 +++++ .../test/CodeGen/Hexagon/postincopt-crash.mir | 58 ++ .../CodeGen/Hexagon/postincopt-dcfetch.mir | 19 + .../CodeGen/Hexagon/valid-offset-loadbsw4.mir | 32 + 11 files changed, 1462 insertions(+), 1 deletion(-) create mode 100644 llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp create mode 100644 llvm/test/CodeGen/Hexagon/post-inc-vec.mir create mode 100644 llvm/test/CodeGen/Hexagon/post_inc_store.mir create mode 100644 llvm/test/CodeGen/Hexagon/postincopt-crash.mir create mode 100644 llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir create mode 100644 llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 753f3dcc88e19..19ccd770f071d 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -51,6 +51,7 @@ add_llvm_target(HexagonCodeGen HexagonOptAddrMode.cpp HexagonOptimizeSZextends.cpp HexagonPeephole.cpp + HexagonPostIncOpt.cpp HexagonRDFOpt.cpp HexagonRegisterInfo.cpp HexagonSelectionDAGInfo.cpp diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 619c7dc69f9b2..91cc9307786b6 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1655,6 +1655,13 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const { return 
getAddrMode(MI) == HexagonII::PostInc; } +bool HexagonInstrInfo::isPostIncWithImmOffset(const MachineInstr &MI) const { + unsigned BasePos, OffsetPos; + if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return false; + return isPostIncrement(MI) && MI.getOperand(OffsetPos).isImm(); +} + // Returns true if an instruction is predicated irrespective of the predicate // sense. For example, all of the following will return true. // if (p0) R1 = add(R2, R3) @@ -2436,6 +2443,55 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const { Opcode == Hexagon::J2_loop1rext; } +bool HexagonInstrInfo::isCircBufferInstr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + return false; + case Hexagon::L2_loadalignb_pci: + case Hexagon::L2_loadalignb_pcr: + case Hexagon::L2_loadalignh_pci: + case Hexagon::L2_loadalignh_pcr: + case Hexagon::L2_loadbsw2_pci: + case Hexagon::L2_loadbsw2_pcr: + case Hexagon::L2_loadbsw4_pci: + case Hexagon::L2_loadbsw4_pcr: + case Hexagon::L2_loadbzw2_pci: + case Hexagon::L2_loadbzw2_pcr: + case Hexagon::L2_loadbzw4_pci: + case Hexagon::L2_loadbzw4_pcr: + case Hexagon::L2_loadrb_pci: + case Hexagon::L2_loadrb_pcr: + case Hexagon::L2_loadrd_pci: + case Hexagon::L2_loadrd_pcr: + case Hexagon::L2_loadrh_pci: + case Hexagon::L2_loadrh_pcr: + case Hexagon::L2_loadri_pci: + case Hexagon::L2_loadri_pcr: + case Hexagon::L2_loadrub_pci: + case Hexagon::L2_loadrub_pcr: + case Hexagon::L2_loadruh_pci: + case Hexagon::L2_loadruh_pcr: + case Hexagon::S2_storerbnew_pci: + case Hexagon::S2_storerbnew_pcr: + case Hexagon::S2_storerb_pci: + case Hexagon::S2_storerb_pcr: + case Hexagon::S2_storerd_pci: + case Hexagon::S2_storerd_pcr: + case Hexagon::S2_storerf_pci: + case Hexagon::S2_storerf_pcr: + case Hexagon::S2_storerhnew_pci: + case Hexagon::S2_storerhnew_pcr: + case Hexagon::S2_storerh_pci: + case Hexagon::S2_storerh_pcr: + case Hexagon::S2_storerinew_pci: + case Hexagon::S2_storerinew_pcr: + case Hexagon::S2_storeri_pci: + case 
Hexagon::S2_storeri_pcr: + return true; + } + return false; +} + bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return false; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index e496995d3ff12..65783c560321a 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -434,6 +434,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { bool predCanBeUsedAsDotNew(const MachineInstr &MI, Register PredReg) const; bool PredOpcodeHasJMP_c(unsigned Opcode) const; bool predOpcodeHasNot(ArrayRef Cond) const; + bool isPostIncWithImmOffset(const MachineInstr &MI) const; + bool isCircBufferInstr(const MachineInstr &MI) const; unsigned getAddrMode(const MachineInstr &MI) const; MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset, diff --git a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp new file mode 100644 index 0000000000000..4c845f24f76a9 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp @@ -0,0 +1,689 @@ +//===-- HexagonPostIncOpt.cpp - Hexagon Post Increment Optimization Pass --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Convert post-inc addressing mode into base-offset addressing mode. +// Ex: +// original loop: +// v1 = phi(v0, v3) +// v2,v3 = post_load v1, 4 + +// Often, unroller creates below form of post-increments: +// v1 = phi(v0, v3') +// v2,v3 = post_load v1, 4 +// v2',v3'= post_load v3, 4 + +// This can be optimized in two ways + +// 1. +// v1 = phi(v0, v3') +// v2,v3' = post_load v1, 8 +// v2' = load v3', -4 +// +// 2. 
+// v1 = phi(v0, v3') +// v2,v3' = post_load v1, 8 +// v2' = load v1, 4 +// +// Option 2 is favored as we can packetize two memory operations in a single +// packet. However, this is not always favorable due to memory dependences +// and in cases where we form a bigger chain of post-increment ops that will +// create more spills as we can not execute post-increment ops with out +// executing base-offset instructions. +//===----------------------------------------------------------------------===// +#include "HexagonInstrInfo.h" +#include "HexagonSubtarget.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "hexagon-postincopt" + +static cl::opt PostIncChainThreshold( + "post-inc-chain-threshold", cl::Hidden, cl::init(4), + cl::desc("Limit the number of post-inc instructions in a chain.")); + +static cl::opt PreferPostIncStore( + "prefer-post-inc-store", cl::Hidden, cl::init(true), + cl::desc("Prefer post-inc store in a list of loads and stores.")); + +namespace llvm { +void initializeHexagonPostIncOptPass(PassRegistry &); +FunctionPass *createHexagonPostIncOpt(); +} // namespace llvm + +namespace { + +class HexagonPostIncOpt : public MachineFunctionPass { + MachineLoopInfo *MLI = nullptr; + const HexagonInstrInfo *HII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; + const HexagonSubtarget 
*HST = nullptr; + +public: + static char ID; + + HexagonPostIncOpt() : MachineFunctionPass(ID) { + initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Hexagon Post-Inc-Opt Pass"; } + + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + bool translatePostIncsInLoop(MachineBasicBlock &MBB); + void replacePostIncWithBaseOffset(MachineBasicBlock &MBB) const; + void replacePostIncWithBaseOffset(MachineInstr &MI) const; + bool isPostIncInsn(MachineInstr &MI) const; + void foldAdds(MachineBasicBlock &MBB) const; + void updateBaseAndOffset(MachineInstr &MI, MachineInstr &AddMI) const; + void removeDeadInstructions(MachineBasicBlock &MBB) const; + + void generatePostInc(MachineBasicBlock &MBB); + bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const; + void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const; + + bool isValidOffset(const MachineInstr &MI, int64_t Offset) const; + bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const; +}; + +class HexagonPostIncOptSchedDAG : public ScheduleDAGInstrs { + HexagonPostIncOpt &Pass; + +public: + HexagonPostIncOptSchedDAG(HexagonPostIncOpt &P, MachineFunction &MF, + MachineLoopInfo *MLI) + : ScheduleDAGInstrs(MF, MLI, false), Pass(P){}; + void schedule() override; + ScheduleDAGTopologicalSort &getTopo() { return Topo; }; +}; + +} // End anonymous namespace. + +char HexagonPostIncOpt::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonPostIncOpt, DEBUG_TYPE, + "Hexagon Post-Inc-Opt Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(HexagonPostIncOpt, DEBUG_TYPE, "Hexagon Post-Inc-Opt Pass", + false, false) + +/// Return true if MIA dominates MIB. 
+static bool dominates(MachineInstr *MIA, MachineInstr *MIB) { + if (MIA->getParent() != MIB->getParent()) + return false; // Don't know since machine dominator tree is out of date. + + MachineBasicBlock *MBB = MIA->getParent(); + MachineBasicBlock::iterator I = MBB->instr_begin(); + // Iterate over the basic block until MIA or MIB is found. + for (; &*I != MIA && &*I != MIB; ++I) + ; + + // MIA dominates MIB if MIA is found first. + return &*I == MIA; +} + +// Return the Phi register value that comes from the loop block. +static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) { + for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) + if (Phi->getOperand(i + 1).getMBB() == LoopBB) + return Phi->getOperand(i).getReg(); + return UINT_MAX; +} + +static bool isAddWithImmValue(const MachineInstr &MI) { + // FIXME: For now, only deal with adds that have strict immediate values. + // Some A2_addi instructions can be of the form. + // %338:intregs = A2_addi %7:intregs, @_ZL7phs_tbl + 16 + return MI.getOpcode() == Hexagon::A2_addi && MI.getOperand(2).isImm(); +} + +// Compute the number of 'real' instructions in the basic block by +// ignoring terminators. +static unsigned getBasicBlockSize(MachineBasicBlock &MBB) { + unsigned size = 0; + for (auto &I : make_range(MBB.begin(), MBB.getFirstTerminator())) + if (!I.isDebugInstr()) + size++; + return size; +} + +// Setup Post increment Schedule DAG. +static void initPISchedDAG(HexagonPostIncOptSchedDAG &PIDAG, + MachineBasicBlock &MBB) { + PIDAG.startBlock(&MBB); + PIDAG.enterRegion(&MBB, MBB.begin(), MBB.getFirstTerminator(), + getBasicBlockSize(MBB)); + // Build the graph. + PIDAG.schedule(); + // exitRegion() is an empty function in base class. So, safe to call it here. + PIDAG.exitRegion(); +} + +// Check if post-increment candidate has any memory dependence on any +// instruction in the chain. 
+static bool hasMemoryDependency(SUnit *PostIncSU, + SmallVector &UseList) { + + // FIXME: Fine tune the order dependence. Probably can only consider memory + // related OrderKind. + for (auto &Dep : PostIncSU->Succs) + if (Dep.getKind() == SDep::Order) + if (std::find(UseList.begin(), UseList.end(), + Dep.getSUnit()->getInstr()) != UseList.end()) + return true; + + return false; +} + +// Fold an add with immediate into either an add or a load or a store. +void HexagonPostIncOpt::foldAdds(MachineBasicBlock &MBB) const { + LLVM_DEBUG(dbgs() << "#Fold add instructions in this block.\n"); + for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) { + if (!isAddWithImmValue(MI)) + continue; + unsigned DefReg = MI.getOperand(0).getReg(); + unsigned AddReg = MI.getOperand(1).getReg(); + int64_t AddImm = MI.getOperand(2).getImm(); + + SmallVector UseList; + // Gather the uses of add instruction's def reg. + for (auto &MO : make_range(MRI->use_begin(DefReg), MRI->use_end())) { + MachineInstr *UseMI = MO.getParent(); + // Deal with only the instuctions that belong to this block. + // If we cross this block, the generation of post-increment logic + // will not be able to transform to post-inc due to dominance. + if (UseMI->getParent() == &MBB) + UseList.push_back(UseMI); + } + + if (UseList.empty()) + continue; + + LLVM_DEBUG({ + dbgs() << "Current instruction considered for folding \n"; + MI.dump(); + }); + + for (auto UseMI : UseList) { + if (isAddWithImmValue(*UseMI)) { + int64_t NewImm = AddImm + UseMI->getOperand(2).getImm(); + // Fold if the new immediate is with in the range. 
+ if (HII->isValidOffset(UseMI->getOpcode(), NewImm, TRI, false)) { + LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is folded in to \n"; + }); + UseMI->getOperand(1).setReg(AddReg); + UseMI->getOperand(2).setImm(NewImm); + LLVM_DEBUG(UseMI->dump()); + } + } else if (HII->isBaseImmOffset(*UseMI)) { + LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is folded in to \n"; + }); + updateBaseAndOffset(*UseMI, MI); + LLVM_DEBUG(UseMI->dump()); + } + LLVM_DEBUG(dbgs() << "\n"); + } + } + removeDeadInstructions(MBB); + LLVM_DEBUG(dbgs() << "#End of the fold instructions logic.\n"); +} + +void HexagonPostIncOpt::updateBaseAndOffset(MachineInstr &MI, + MachineInstr &AddMI) const { + assert(HII->isBaseImmOffset(MI)); + unsigned BasePos, OffsetPos; + if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return; + + MachineOperand &OffsetOp = MI.getOperand(OffsetPos); + MachineOperand &BaseOp = MI.getOperand(BasePos); + + if (BaseOp.getReg() != AddMI.getOperand(0).getReg()) + return; + + unsigned IncBase = AddMI.getOperand(1).getReg(); + int64_t IncValue = AddMI.getOperand(2).getImm(); + + int64_t NewOffset = OffsetOp.getImm() + IncValue; + if (!isValidOffset(MI, NewOffset)) + return; + + OffsetOp.setImm(NewOffset); + BaseOp.setReg(IncBase); +} + +void HexagonPostIncOpt::removeDeadInstructions(MachineBasicBlock &MBB) const { + // For MBB, check that the value defined by each instruction is used. + // If not, delete it. + for (MachineBasicBlock::reverse_instr_iterator MI = MBB.instr_rbegin(), + ME = MBB.instr_rend(); + MI != ME;) { + // From DeadMachineInstructionElem. Don't delete inline assembly. + if (MI->isInlineAsm()) { + ++MI; + continue; + } + bool SawStore = false; + // Check if it's safe to remove the instruction due to side effects. 
+ if (!MI->isSafeToMove(nullptr, SawStore)) { + ++MI; + continue; + } + unsigned Uses = 0; + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); + MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef()) + continue; + unsigned reg = MOI->getReg(); + // Assume physical registers are used. + if (Register::isPhysicalRegister(reg)) { + Uses++; + continue; + } + if (MRI->use_begin(reg) != MRI->use_end()) + Uses++; + } + if (!Uses) { + MI++->eraseFromParent(); + continue; + } + ++MI; + } +} + +bool HexagonPostIncOpt::isPostIncInsn(MachineInstr &MI) const { + // Predicated post-increments are not yet handled. (ISel is not generating + // them yet). Circular buffer instructions should not be handled. + return (HII->isPostIncWithImmOffset(MI) && !HII->isPredicated(MI) && + !HII->isCircBufferInstr(MI)); +} + +/// For instructions with a base and offset, return true if the new Offset +/// is a valid value with the correct alignment. +bool HexagonPostIncOpt::isValidOffset(const MachineInstr &MI, + int64_t Offset) const { + if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false)) + return false; + unsigned AlignMask = HII->getMemAccessSize(MI) - 1; + return (Offset & AlignMask) == 0; +} + +bool HexagonPostIncOpt::isValidPostIncValue(const MachineInstr &MI, + int IncVal) const { + unsigned AlignMask = HII->getMemAccessSize(MI) - 1; + if ((IncVal & AlignMask) != 0) + return false; + + // Number of total bits in the instruction used to encode Inc value. + unsigned IncBits = 4; + // For HVX instructions, the offset is 3. 
+ if (HexagonII::isCVI(MI.getDesc())) + IncBits = 3; + + IncBits += Log2_32(HII->getMemAccessSize(MI)); + if (HII->getMemAccessSize(MI) > 8) + IncBits = 16; + + int MinValidVal = -1U << (IncBits - 1); + int MaxValidVal = ~(-1U << (IncBits - 1)); + return (IncVal >= MinValidVal && IncVal <= MaxValidVal); +} + +void HexagonPostIncOptSchedDAG::schedule() { + AliasAnalysis *AA = &Pass.getAnalysis().getAAResults(); + buildSchedGraph(AA); +} + +// Replace post-increment operations with base+offset counterpart. +void HexagonPostIncOpt::replacePostIncWithBaseOffset( + MachineBasicBlock &MBB) const { + LLVM_DEBUG(dbgs() << "#Replacing post-increment instructions with " + "base+offset counterparts.\n"); + + SmallVector MIList; + for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) { + // Check for eligible post-inc candidates. + if (!isPostIncInsn(MI)) + continue; + MIList.push_back(&MI); + } + + for (auto MI : MIList) + replacePostIncWithBaseOffset(*MI); + + LLVM_DEBUG(dbgs() << "#Done with replacing post-increment instructions.\n"); +} + +void HexagonPostIncOpt::replacePostIncWithBaseOffset(MachineInstr &MI) const { + short NewOpcode = HII->changeAddrMode_pi_io(MI.getOpcode()); + if (NewOpcode < 0) + return; + + unsigned BasePos = 0, OffsetPos = 0; + if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) + return; + const MachineOperand &PostIncOffset = MI.getOperand(OffsetPos); + const MachineOperand &PostIncBase = MI.getOperand(BasePos); + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineOperand *PostIncDest; + MachineInstrBuilder MIB; + if (MI.mayLoad()) { + PostIncDest = &MI.getOperand(1); + const MachineOperand &LDValue = MI.getOperand(0); + MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode)); + MIB.add(LDValue).add(PostIncBase).addImm(0); + } else { + PostIncDest = &MI.getOperand(0); + const MachineOperand &STValue = MI.getOperand(3); + MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode)); + 
MIB.add(PostIncBase).addImm(0).add(STValue); + } + + // Transfer memoperands. + MIB->cloneMemRefs(*MBB.getParent(), MI); + + // Create an add instruction for the post-inc addition of offset. + MachineInstrBuilder MIBA = BuildMI(MBB, MI, DL, HII->get(Hexagon::A2_addi)); + MIBA.add(*PostIncDest).add(PostIncBase).add(PostIncOffset); + + LLVM_DEBUG({ + dbgs() << "\n"; + MI.dump(); + dbgs() << "\tis tranformed to \n"; + MIB->dump(); + MIBA->dump(); + dbgs() << "\n\n"; + }); + + MI.eraseFromParent(); +} + +void HexagonPostIncOpt::generatePostInc(MachineBasicBlock &MBB) { + LLVM_DEBUG(dbgs() << "# Generate Post-inc and update uses if needed.\n"); + MachineBasicBlock::iterator MII = MBB.getFirstNonPHI(); + MachineBasicBlock::iterator MIE = MBB.instr_begin(); + bool isOK = true; + while (MII != MIE) { + MachineInstr *Phi = &*std::prev(MII); + MII = std::prev(MII); + unsigned LoopVal = getLoopPhiReg(Phi, &MBB); + if (LoopVal == UINT_MAX) + continue; + MachineInstr *LoopInst = MRI->getVRegDef(LoopVal); + if (!isAddWithImmValue(*LoopInst)) + continue; + + if (LoopInst->getOpcode() != Hexagon::A2_addi) + continue; + + unsigned AddReg = LoopInst->getOperand(1).getReg(); + int64_t AddImm = LoopInst->getOperand(2).getImm(); + SmallVector UseList; + MachineInstr *PostIncCandidate = nullptr; + + // Find the probable candidates for Post-increment instruction. 
+ SmallVector CandList; + for (auto &MO : make_range(MRI->use_begin(AddReg), MRI->use_end())) { + MachineInstr *UseMI = MO.getParent(); + + if (UseMI == LoopInst) + continue; + + if (!dominates(UseMI, LoopInst)) { + isOK = false; + break; + } + const MachineOperand *BaseOp = nullptr; + int64_t Offset; + bool OffsetIsScalable; + if (!HII->isBaseImmOffset(*UseMI) || + !HII->getMemOperandWithOffset(*UseMI, BaseOp, Offset, + OffsetIsScalable, TRI)) { + isOK = false; + break; + } + int64_t NewOffset = Offset - AddImm; + if (!isValidOffset(*UseMI, NewOffset) || !BaseOp->isReg() || + BaseOp->getReg() != AddReg) { + isOK = false; + break; + } + if (OffsetIsScalable) { + isOK = false; + break; + } + if (Offset == 0) { + // If you have stores in the chain, make sure they are in the beginning + // of the list. Eg: LD, LD, ST, ST will end up as LD, LD, PostInc_ST, + // ST. + if (UseMI->mayStore() && PreferPostIncStore) + CandList.insert(CandList.begin(), UseMI); + else + CandList.push_back(UseMI); + continue; + } + UseList.push_back(UseMI); + } + + if (!isOK) + continue; + + for (auto MI : CandList) { + if (!PostIncCandidate) + PostIncCandidate = MI; + // Push the rest of the list for updation. + else + UseList.push_back(MI); + } + + // If a candidate is found, replace it with the post-inc instruction. + // Also, adjust offset for other uses as needed. + if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst)) + continue; + + // Logic to determine what the base register to be. + // There are two choices: + // 1. New address register after we updated the post-increment candidate. + // v2,v3 = post_load v1, 4 + // v3 is the choice here. + // 2. The base register we used in post-increment candidate. + // v2,v3 = post_load v1, 4 + // v1 is the choice here. + // Use v3 if there is a memory dependence between post-inc instruction and + // any other instruction in the chain. 
+ // FIXME: We can do some complex DAG analysis based off height and depth and + // selectively update other instructions in the chain. Use v3 if there are + // more instructions in the chain, otherwise we will end up increasing the + // height of the DAG resulting in more spills. By default we have a + // threshold controlled by the option "post-inc-chain-threshold" which is + // set to 4. v1 is preferred as we can packetize two memory operations in a + // single packet in scalar core. But it heavily depends on the structure of + // DAG. + bool UpdateBaseToNew = false; + + // Do not bother to build a DAG and analyze if the Use list is empty. + if (!UseList.empty()) { + MachineFunction *MF = MBB.getParent(); + // Setup the Post-inc schedule DAG. + HexagonPostIncOptSchedDAG PIDAG(*this, *MF, MLI); + initPISchedDAG(PIDAG, MBB); + SUnit *SU = PIDAG.getSUnit(PostIncCandidate); + if (hasMemoryDependency(SU, UseList) || + UseList.size() >= PostIncChainThreshold) + UpdateBaseToNew = true; + } + + if (UpdateBaseToNew) { + LLVM_DEBUG(dbgs() << "The heuristic determines to update the uses of the " + "base register of post-increment\n"); + for (auto UseMI : UseList) { + if (!dominates(PostIncCandidate, UseMI)) + continue; + unsigned BasePos, OffsetPos; + if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) { + // New offset has already been validated; no need to do it again. 
+ LLVM_DEBUG({ + UseMI->dump(); + dbgs() << "\t is transformed to \n"; + }); + int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm; + UseMI->getOperand(OffsetPos).setImm(NewOffset); + UseMI->getOperand(BasePos).setReg(LoopVal); + LLVM_DEBUG(UseMI->dump()); + } + } + } + replaceWithPostInc(PostIncCandidate, LoopInst); + } + LLVM_DEBUG(dbgs() << "# End of generation of Post-inc.\n"); +} + +bool HexagonPostIncOpt::canReplaceWithPostInc(MachineInstr *MI, + MachineInstr *AddMI) const { + if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0) + return false; + assert(AddMI->getOpcode() == Hexagon::A2_addi); + return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm()); +} + +void HexagonPostIncOpt::replaceWithPostInc(MachineInstr *MI, + MachineInstr *AddMI) const { + short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode()); + assert(NewOpcode >= 0 && + "Couldn't change base offset to post-increment form"); + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + const MachineOperand &IncDest = AddMI->getOperand(0); + const MachineOperand &IncBase = AddMI->getOperand(1); + const MachineOperand &IncValue = AddMI->getOperand(2); + MachineInstrBuilder MIB; + LLVM_DEBUG({ + dbgs() << "\n\n"; + MI->dump(); + dbgs() << "\t is tranformed to post-inc form of \n"; + }); + + if (MI->mayLoad()) { + const MachineOperand &LDValue = MI->getOperand(0); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue); + } else { + const MachineOperand &STValue = MI->getOperand(2); + MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); + MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue); + } + + // Transfer memoperands. 
+ MIB->cloneMemRefs(*MBB.getParent(), *MI); + + LLVM_DEBUG({ + MIB->dump(); + dbgs() << "As a result this add instruction is erased.\n"; + AddMI->dump(); + }); + + MI->eraseFromParent(); + AddMI->eraseFromParent(); +} + +bool HexagonPostIncOpt::translatePostIncsInLoop(MachineBasicBlock &MBB) { + // Algorithm: + // 1. Replace all the post-inc instructions with Base+Offset instruction and + // an add instruction in this block. + // 2. Fold all the adds in to respective uses. + // 3. Generate post-increment instructions and update the uses of the base + // register if needed based on constraints. + + replacePostIncWithBaseOffset(MBB); + foldAdds(MBB); + generatePostInc(MBB); + return true; +} + +bool HexagonPostIncOpt::runOnMachineFunction(MachineFunction &MF) { + + // Skip pass if requested. + if (skipFunction(MF.getFunction())) + return false; + + // Get Target Information. + MLI = &getAnalysis(); + HST = &MF.getSubtarget(); + TRI = HST->getRegisterInfo(); + MRI = &MF.getRegInfo(); + HII = HST->getInstrInfo(); + + // Skip this pass for TinyCore. + // Tiny core allwos partial post increment operations - This constraint can + // be imposed inside the pass. In a chain of post-increments, the first can + // be post-increment, rest can be adjusted to base+offset (these are + // inexpensive in most of the cases); + if (HST->isTinyCore()) + return false; + + LLVM_DEBUG({ + dbgs() << "Begin: Hexagon Post-Inc-Opt Pass.\n"; + dbgs() << "Function: " << MF.getName() << "\n"; + }); + bool Change = false; + std::vector MLBB; + for (auto &BB : MF) { + // Check if this Basic Block belongs to any loop. + auto *LI = MLI->getLoopFor(&BB); + // We only deal with inner-most loops that has one block. + if (LI && LI->getBlocks().size() == 1) { + MachineBasicBlock *MBB = LI->getHeader(); + // Do not traverse blocks that are already visited. 
+ if (std::find(MLBB.begin(), MLBB.end(), MBB) != MLBB.end()) + continue; + + MLBB.push_back(MBB); + + LLVM_DEBUG(dbgs() << "\n\t Basic Block: " << MBB->getName() << "\n"); + Change |= translatePostIncsInLoop(*MBB); + } + } + LLVM_DEBUG(dbgs() << "End: Hexagon Post-Inc-Opt Pass\n"); + return Change; +} + +FunctionPass *llvm::createHexagonPostIncOpt() { + return new HexagonPostIncOpt(); +} diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 49ef547d65fb2..f640f76bc47b8 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -125,6 +125,10 @@ static cl::opt EnableInstSimplify("hexagon-instsimplify", cl::Hidden, cl::init(true), cl::desc("Enable instsimplify")); +static cl::opt DisableHexagonPostIncOpt( + "hexagon-postinc-opt", cl::Hidden, + cl::desc("Disable Hexagon post-increment optimization")); + /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the /// library. 
In particular, it seems that it is not possible to get @@ -162,6 +166,7 @@ namespace llvm { void initializeHexagonNewValueJumpPass(PassRegistry&); void initializeHexagonOptAddrModePass(PassRegistry&); void initializeHexagonPacketizerPass(PassRegistry&); + void initializeHexagonPostIncOptPass(PassRegistry &); void initializeHexagonRDFOptPass(PassRegistry&); void initializeHexagonSplitDoubleRegsPass(PassRegistry&); void initializeHexagonVExtractPass(PassRegistry &); @@ -194,6 +199,7 @@ namespace llvm { FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(bool Minimal); FunctionPass *createHexagonPeephole(); + FunctionPass *createHexagonPostIncOpt(); FunctionPass *createHexagonRDFOpt(); FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonSplitDoubleRegs(); @@ -224,6 +230,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() { initializeHexagonNewValueJumpPass(PR); initializeHexagonOptAddrModePass(PR); initializeHexagonPacketizerPass(PR); + initializeHexagonPostIncOptPass(PR); initializeHexagonRDFOptPass(PR); initializeHexagonSplitDoubleRegsPass(PR); initializeHexagonVectorCombineLegacyPass(PR); @@ -251,6 +258,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, (HexagonNoOpt ? 
CodeGenOptLevel::None : OL)), TLOF(std::make_unique()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); + initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -425,6 +433,11 @@ void HexagonPassConfig::addPreRegAlloc() { if (!DisableHardwareLoops) addPass(createHexagonHardwareLoops()); } + + if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive) + if (!DisableHexagonPostIncOpt) + addPass(createHexagonPostIncOpt()); + if (TM->getOptLevel() >= CodeGenOptLevel::Default) addPass(&MachinePipelinerID); } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index ca982696b0600..98404121bda02 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -18,6 +18,7 @@ #include "HexagonDepITypes.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/MC/MCInstrDesc.h" namespace llvm { @@ -48,7 +49,7 @@ namespace HexagonII { // MCInstrDesc TSFlags // *** Must match HexagonInstrFormat*.td *** - enum { + enum HexagonTSFlagsVal { // This 7-bit field describes the insn type. TypePos = 0, TypeMask = 0x7f, @@ -173,6 +174,11 @@ namespace HexagonII { hasUnaryRestrictionMask = 0x1, }; + inline unsigned getTSFlags(const MCInstrDesc &MID, HexagonTSFlagsVal Pos, + unsigned Mask) { + return (MID.TSFlags >> Pos) & Mask; + } + // *** The code above must match HexagonInstrFormat*.td *** // // Hexagon specific MO operand flag mask. 
@@ -275,6 +281,10 @@ namespace HexagonII { INST_ICLASS_ALU32_3 = 0xf0000000 }; + inline bool isCVI(const MCInstrDesc &MID) { + return getTSFlags(MID, isCVIPos, isCVIMask) != 0; + } + LLVM_ATTRIBUTE_UNUSED static unsigned getMemAccessSizeInBytes(MemAccessSize S) { switch (S) { diff --git a/llvm/test/CodeGen/Hexagon/post-inc-vec.mir b/llvm/test/CodeGen/Hexagon/post-inc-vec.mir new file mode 100644 index 0000000000000..3788dc3fecd89 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/post-inc-vec.mir @@ -0,0 +1,413 @@ +#RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s + +# Test that we do not generate two post-increment vector load/store +# in the loop. +# CHECK: J2_loop0r +# CHECK: V6_vS32b_pi +# CHECK-NOT: = V6_vL32b_pi +# CHECK: V6_vL32b_ai +# CHECK: V6_vL32b_ai +# CHECK: V6_vS32b_ai +# CHECK: ENDLOOP0 + +--- | + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <1024 x i1> @llvm.hexagon.V6.pred.scalar2v2.128B(i32) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write) + declare void @llvm.hexagon.V6.vS32b.qpred.ai.128B(<1024 x i1>, ptr, <32 x i32>) #1 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32>, <32 x i32>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32>, <32 x i32>, i32) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <32 x 
i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32>, i32) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32>, <32 x i32>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32>, <64 x i32>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) + declare void @llvm.assume(i1 noundef) #2 + + ; Function Attrs: noinline nounwind + define void @blah(i32 %0, i32 %1, ptr noalias %2, ptr noalias nocapture readonly %3, ptr noalias nocapture readonly %4, ptr nocapture readnone %5, ptr nocapture readnone %6, i32 %7, i32 %8, ptr nocapture readonly %9, ptr nocapture readonly %10) local_unnamed_addr #3 { + entry: + %11 = call i32 @llvm.hexagon.S2.extractu(i32 %0, i32 23, i32 9) + %12 = shl i32 %11, 7 + %mul16.i = mul nsw i32 %12, %1 + %add.i = add nsw i32 %1, 1 + %mul17.i = mul nsw i32 %add.i, %12 + %cmp184.i = icmp slt i32 %mul16.i, %mul17.i + br i1 %cmp184.i, label %for.body.lr.ph.i, label %for.end.i + + for.body.lr.ph.i: ; preds = %entry + %13 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> , <32 x i32> ) #5 + %14 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> zeroinitializer, <32 x i32> zeroinitializer) #5 + %15 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32) #5 + %cgep = getelementptr i8, ptr %2, i32 %mul16.i + %cgep8 = getelementptr i8, ptr %4, i32 %mul16.i + %cgep9 = getelementptr i8, ptr %3, i32 %mul16.i + br label %for.body.i + + for.body.i: ; preds = %for.body.i, %for.body.lr.ph.i + %lsr.iv6 = phi ptr [ %cgep12, %for.body.i ], [ %cgep9, %for.body.lr.ph.i ] + %lsr.iv3 = phi ptr [ %cgep11, %for.body.i ], [ %cgep8, %for.body.lr.ph.i ] + %lsr.iv = phi 
ptr [ %cgep10, %for.body.i ], [ %cgep, %for.body.lr.ph.i ] + %elemIdx.05.i = phi i32 [ %mul16.i, %for.body.lr.ph.i ], [ %add19.i, %for.body.i ] + %16 = load <32 x i32>, ptr %lsr.iv6, align 128 + %17 = load <32 x i32>, ptr %lsr.iv3, align 128 + %18 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %17, <32 x i32> %16) #5 + %19 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %13, <64 x i32> %18) #5 + %20 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %14, <64 x i32> %18) #5 + %21 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %19) #5 + %22 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %20) #5 + %23 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %22, i32 7) #5 + %24 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %21, <32 x i32> %23) #5 + %25 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %19) #5 + %26 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %20) #5 + %27 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %26, i32 7) #5 + %28 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %25, <32 x i32> %27) #5 + %29 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %24, <32 x i32> %15) #5 + %30 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %28, <32 x i32> %15) #5 + %31 = tail call <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32> %29, <32 x i32> %30, i32 4) #5 + store <32 x i32> %31, ptr %lsr.iv, align 128 + %add19.i = add nsw i32 %elemIdx.05.i, 128 + %cmp18.i = icmp slt i32 %add19.i, %mul17.i + %cgep10 = getelementptr i8, ptr %lsr.iv, i32 128 + %cgep11 = getelementptr i8, ptr %lsr.iv3, i32 128 + %cgep12 = getelementptr i8, ptr %lsr.iv6, i32 128 + br i1 %cmp18.i, label %for.body.i, label %for.end.i + + for.end.i: ; preds = %for.body.i, %entry + ret void + } + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) + declare void 
@llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare i32 @llvm.hexagon.S2.extractu(i32, i32 immarg, i32 immarg) #0 + + attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } + attributes #1 = { nocallback nofree nosync nounwind willreturn memory(write) } + attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } + attributes #3 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+hvx-length128b,+hvxv68,+v68,-long-calls,-small-data" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + attributes #5 = { nounwind } + +... 
+--- +name: blah +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: intregs, preferred-register: '' } + - { id: 1, class: intregs, preferred-register: '' } + - { id: 2, class: hvxwr, preferred-register: '' } + - { id: 3, class: hvxwr, preferred-register: '' } + - { id: 4, class: hvxvr, preferred-register: '' } + - { id: 5, class: intregs, preferred-register: '' } + - { id: 6, class: intregs, preferred-register: '' } + - { id: 7, class: intregs, preferred-register: '' } + - { id: 8, class: intregs, preferred-register: '' } + - { id: 9, class: intregs, preferred-register: '' } + - { id: 10, class: intregs, preferred-register: '' } + - { id: 11, class: intregs, preferred-register: '' } + - { id: 12, class: intregs, preferred-register: '' } + - { id: 13, class: intregs, preferred-register: '' } + - { id: 14, class: intregs, preferred-register: '' } + - { id: 15, class: intregs, preferred-register: '' } + - { id: 16, class: intregs, preferred-register: '' } + - { id: 17, class: intregs, preferred-register: '' } + - { id: 18, class: intregs, preferred-register: '' } + - { id: 19, class: intregs, preferred-register: '' } + - { id: 20, class: intregs, preferred-register: '' } + - { id: 21, class: intregs, preferred-register: '' } + - { id: 22, class: intregs, preferred-register: '' } + - { id: 23, class: intregs, preferred-register: '' } + - { id: 24, class: intregs, preferred-register: '' } + - { id: 25, class: predregs, preferred-register: '' } + - { id: 26, class: predregs, preferred-register: '' } + - { id: 27, class: hvxvr, preferred-register: '' } + - { id: 28, class: intregs, preferred-register: '' } + - { id: 29, class: 
hvxvr, preferred-register: '' } + - { id: 30, class: intregs, preferred-register: '' } + - { id: 31, class: hvxvr, preferred-register: '' } + - { id: 32, class: intregs, preferred-register: '' } + - { id: 33, class: hvxvr, preferred-register: '' } + - { id: 34, class: hvxvr, preferred-register: '' } + - { id: 35, class: hvxwr, preferred-register: '' } + - { id: 36, class: hvxwr, preferred-register: '' } + - { id: 37, class: hvxwr, preferred-register: '' } + - { id: 38, class: hvxvr, preferred-register: '' } + - { id: 39, class: hvxvr, preferred-register: '' } + - { id: 40, class: intregs, preferred-register: '' } + - { id: 41, class: hvxvr, preferred-register: '' } + - { id: 42, class: hvxvr, preferred-register: '' } + - { id: 43, class: hvxvr, preferred-register: '' } + - { id: 44, class: hvxvr, preferred-register: '' } + - { id: 45, class: hvxvr, preferred-register: '' } + - { id: 46, class: hvxvr, preferred-register: '' } + - { id: 47, class: hvxvr, preferred-register: '' } + - { id: 48, class: hvxvr, preferred-register: '' } + - { id: 49, class: intregslow8, preferred-register: '' } + - { id: 50, class: hvxvr, preferred-register: '' } + - { id: 51, class: predregs, preferred-register: '' } + - { id: 52, class: intregs, preferred-register: '' } + - { id: 53, class: intregs, preferred-register: '' } + - { id: 54, class: intregs, preferred-register: '' } + - { id: 55, class: intregs, preferred-register: '' } + - { id: 56, class: intregs, preferred-register: '' } + - { id: 57, class: intregs, preferred-register: '' } + - { id: 58, class: intregs, preferred-register: '' } + - { id: 59, class: intregs, preferred-register: '' } + - { id: 60, class: intregs, preferred-register: '' } + - { id: 61, class: hvxvr, preferred-register: '' } + - { id: 62, class: intregs, preferred-register: '' } + - { id: 63, class: hvxvr, preferred-register: '' } + - { id: 64, class: intregs, preferred-register: '' } + - { id: 65, class: hvxwr, preferred-register: '' } + - { id: 66, class: 
hvxwr, preferred-register: '' } + - { id: 67, class: hvxwr, preferred-register: '' } + - { id: 68, class: hvxvr, preferred-register: '' } + - { id: 69, class: hvxvr, preferred-register: '' } + - { id: 70, class: hvxvr, preferred-register: '' } + - { id: 71, class: hvxvr, preferred-register: '' } + - { id: 72, class: hvxvr, preferred-register: '' } + - { id: 73, class: hvxvr, preferred-register: '' } + - { id: 74, class: hvxvr, preferred-register: '' } + - { id: 75, class: intregs, preferred-register: '' } + - { id: 76, class: intregs, preferred-register: '' } + - { id: 77, class: intregs, preferred-register: '' } + - { id: 78, class: intregs, preferred-register: '' } + - { id: 79, class: hvxvr, preferred-register: '' } + - { id: 80, class: intregs, preferred-register: '' } + - { id: 81, class: hvxvr, preferred-register: '' } + - { id: 82, class: intregs, preferred-register: '' } + - { id: 83, class: hvxwr, preferred-register: '' } + - { id: 84, class: hvxwr, preferred-register: '' } + - { id: 85, class: hvxwr, preferred-register: '' } + - { id: 86, class: hvxvr, preferred-register: '' } + - { id: 87, class: hvxvr, preferred-register: '' } + - { id: 88, class: hvxvr, preferred-register: '' } + - { id: 89, class: hvxvr, preferred-register: '' } + - { id: 90, class: hvxvr, preferred-register: '' } + - { id: 91, class: hvxvr, preferred-register: '' } + - { id: 92, class: hvxvr, preferred-register: '' } + - { id: 93, class: intregs, preferred-register: '' } + - { id: 94, class: intregs, preferred-register: '' } + - { id: 95, class: intregs, preferred-register: '' } + - { id: 96, class: intregs, preferred-register: '' } + - { id: 97, class: predregs, preferred-register: '' } + - { id: 98, class: predregs, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%16' } + - { reg: '$r1', virtual-reg: '%17' } + - { reg: '$r2', virtual-reg: '%18' } + - { reg: '$r3', virtual-reg: '%19' } + - { reg: '$r4', virtual-reg: '%20' } +frameInfo: + isFrameAddressTaken: false 
+ isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: + - { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, type: default, offset: 20, size: 4, alignment: 4, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, type: default, offset: 16, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, type: default, offset: 12, size: 4, alignment: 4, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 4, type: default, offset: 8, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x40000000), %bb.3(0x40000000) + liveins: $r0, $r1, $r2, $r3, $r4 + + %20:intregs = COPY $r4 + %19:intregs = 
COPY $r3 + %18:intregs = COPY $r2 + %17:intregs = COPY $r1 + %16:intregs = COPY $r0 + %22:intregs = S2_extractu %16, 23, 9 + %23:intregs = S2_asl_i_r %22, 7 + %0:intregs = nsw M2_mpyi %23, %17 + %24:intregs = nsw A2_addi %17, 1 + %1:intregs = nsw M2_mpyi %24, %23 + %25:predregs = C2_cmpgt %1, %0 + J2_jumpf %25, %bb.3, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1.for.body.lr.ph.i: + successors: %bb.4(0x40000000), %bb.6(0x40000000) + + %28:intregs = A2_tfrsi 269488144 + %27:hvxvr = V6_lvsplatw %28 + %30:intregs = A2_tfrsi 1077952576 + %29:hvxvr = V6_lvsplatw %30 + %2:hvxwr = REG_SEQUENCE %29, %subreg.vsub_hi, %27, %subreg.vsub_lo + %31:hvxvr = V6_vd0 + %3:hvxwr = REG_SEQUENCE %31, %subreg.vsub_hi, %31, %subreg.vsub_lo + %32:intregs = A2_tfrsi 32 + %4:hvxvr = V6_lvsplath %32 + %5:intregs = A2_add %18, %0 + %6:intregs = A2_add %20, %0 + %7:intregs = A2_add %19, %0 + %40:intregs = A2_tfrsi 7 + %49:intregslow8 = A2_tfrsi 4 + %52:intregs = A2_sub %1, %0 + %53:intregs = A2_addi %52, 127 + %54:intregs = S2_lsr_i_r %53, 7 + %55:intregs = COPY %54 + %56:intregs = S2_lsr_i_r %55, 1 + %57:intregs = A2_andir %55, 1 + %97:predregs = C2_cmpgtui %56, 0 + J2_jumpf %97, %bb.6, implicit-def $pc + J2_jump %bb.4, implicit-def $pc + + bb.4: + successors: %bb.5(0x80000000) + + J2_loop0r %bb.5, %56, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + J2_jump %bb.5, implicit-def $pc + + bb.5: + successors: %bb.5(0x40000000), %bb.6(0x40000000) + + %58:intregs = PHI %7, %bb.4, %80, %bb.5 + %59:intregs = PHI %6, %bb.4, %82, %bb.5 + %60:intregs = PHI %5, %bb.4, %93, %bb.5 + %61:hvxvr, %62:intregs = V6_vL32b_pi %58, 128 :: (load (s1024) from %ir.lsr.iv6) + %63:hvxvr, %64:intregs = V6_vL32b_pi %59, 128 :: (load (s1024) from %ir.lsr.iv3) + %65:hvxwr = REG_SEQUENCE %63, %subreg.vsub_hi, %61, %subreg.vsub_lo + %66:hvxwr = V6_vmpabusv %2, %65 + %67:hvxwr = V6_vmpabusv %3, %65 + %68:hvxvr = V6_vasrh %67.vsub_hi, %40 + %69:hvxvr = V6_vavgh %66.vsub_hi, %68 + %70:hvxvr = 
V6_vasrh %67.vsub_lo, %40 + %71:hvxvr = V6_vavgh %66.vsub_lo, %70 + %72:hvxvr = V6_vaddhsat %69, %4 + %73:hvxvr = V6_vaddhsat %71, %4 + %74:hvxvr = V6_vasrhbsat %72, %73, %49 + %75:intregs = V6_vS32b_pi %60, 128, %74 :: (store (s1024) into %ir.lsr.iv) + %79:hvxvr, %80:intregs = V6_vL32b_pi %62, 128 :: (load (s1024) from %ir.lsr.iv6 + 128) + %81:hvxvr, %82:intregs = V6_vL32b_pi %64, 128 :: (load (s1024) from %ir.lsr.iv3 + 128) + %83:hvxwr = REG_SEQUENCE %81, %subreg.vsub_hi, %79, %subreg.vsub_lo + %84:hvxwr = V6_vmpabusv %2, %83 + %85:hvxwr = V6_vmpabusv %3, %83 + %86:hvxvr = V6_vasrh %85.vsub_hi, %40 + %87:hvxvr = V6_vavgh %84.vsub_hi, %86 + %88:hvxvr = V6_vasrh %85.vsub_lo, %40 + %89:hvxvr = V6_vavgh %84.vsub_lo, %88 + %90:hvxvr = V6_vaddhsat %87, %4 + %91:hvxvr = V6_vaddhsat %89, %4 + %92:hvxvr = V6_vasrhbsat %90, %91, %49 + %93:intregs = V6_vS32b_pi %75, 128, %92 :: (store (s1024) into %ir.lsr.iv + 128) + ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.6, implicit-def $pc + + bb.6: + successors: %bb.7(0x40000000), %bb.8(0x40000000) + + %94:intregs = PHI %7, %bb.1, %80, %bb.5 + %95:intregs = PHI %6, %bb.1, %82, %bb.5 + %96:intregs = PHI %5, %bb.1, %93, %bb.5 + %98:predregs = C2_cmpgtui %57, 0 + J2_jumpf %98, %bb.8, implicit-def $pc + J2_jump %bb.7, implicit-def $pc + + bb.7: + successors: %bb.2(0x80000000) + + J2_jump %bb.2, implicit-def $pc + + bb.2.for.body.i (machine-block-address-taken): + successors: %bb.8(0x04000000) + + %33:hvxvr, %15:intregs = V6_vL32b_pi %94, 128 :: (load (s1024) from %ir.lsr.iv6) + %34:hvxvr, %14:intregs = V6_vL32b_pi %95, 128 :: (load (s1024) from %ir.lsr.iv3) + %35:hvxwr = REG_SEQUENCE %34, %subreg.vsub_hi, %33, %subreg.vsub_lo + %36:hvxwr = V6_vmpabusv %2, %35 + %37:hvxwr = V6_vmpabusv %3, %35 + %41:hvxvr = V6_vasrh %37.vsub_hi, %40 + %42:hvxvr = V6_vavgh %36.vsub_hi, %41 + %45:hvxvr = V6_vasrh %37.vsub_lo, %40 + %46:hvxvr = V6_vavgh %36.vsub_lo, %45 + %47:hvxvr = V6_vaddhsat %42, %4 + 
%48:hvxvr = V6_vaddhsat %46, %4 + %50:hvxvr = V6_vasrhbsat %47, %48, %49 + %13:intregs = V6_vS32b_pi %96, 128, %50 :: (store (s1024) into %ir.lsr.iv) + J2_jump %bb.8, implicit-def $pc + + bb.8: + successors: %bb.3(0x80000000) + + J2_jump %bb.3, implicit-def $pc + + bb.3.for.end.i: + PS_jmpret $r31, implicit-def dead $pc + +... diff --git a/llvm/test/CodeGen/Hexagon/post_inc_store.mir b/llvm/test/CodeGen/Hexagon/post_inc_store.mir new file mode 100644 index 0000000000000..3e3f51ac9114d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/post_inc_store.mir @@ -0,0 +1,168 @@ +#RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s + +# Test that we convert a post-inc load and store to a regular load and post-inc +# store. +# CHECK: J2_loop0r +# CHECK-NOT: = L2_loadruh_pi +# CHECK: L2_loadruh_io +# CHECK: S2_storerh_pi +# CHECK: ENDLOOP0 + +--- | + ; Function Attrs: nofree norecurse nounwind + define dso_local void @blam(i32 %arg, ptr nocapture %arg1, i16 signext %arg2) local_unnamed_addr #0 { + bb: + %icmp = icmp eq i32 %arg, 0 + br i1 %icmp, label %bb13, label %bb3 + + bb3: ; preds = %bb, %bb10 + %phi = phi i32 [ %add11, %bb10 ], [ 0, %bb ] + %mul = mul i32 %phi, %arg + %cgep = getelementptr i16, ptr %arg1, i32 %mul + br label %bb4 + + bb4: ; preds = %bb4, %bb3 + %lsr.iv = phi i32 [ %lsr.iv.next, %bb4 ], [ %arg, %bb3 ] + %phi5 = phi ptr [ %cgep, %bb3 ], [ %cgep1, %bb4 ] + %load = load i16, ptr %phi5, align 2 + %add = add i16 %load, %arg2 + store i16 %add, ptr %phi5, align 2 + %lsr.iv.next = add i32 %lsr.iv, -1 + %icmp8 = icmp eq i32 %lsr.iv.next, 0 + %cgep1 = getelementptr i16, ptr %phi5, i32 1 + br i1 %icmp8, label %bb10, label %bb4 + + bb10: ; preds = %bb4 + %add11 = add nuw i32 %phi, 1 + %icmp12 = icmp eq i32 %add11, %arg + br i1 %icmp12, label %bb13, label %bb3 + + bb13: ; preds = %bb10, %bb + ret void + } + + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } + +... +--- +name: blam +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: intregs, preferred-register: '' } + - { id: 1, class: intregs, preferred-register: '' } + - { id: 2, class: intregs, preferred-register: '' } + - { id: 3, class: intregs, preferred-register: '' } + - { id: 4, class: intregs, preferred-register: '' } + - { id: 5, class: intregs, preferred-register: '' } + - { id: 6, class: intregs, preferred-register: '' } + - { id: 7, class: intregs, preferred-register: '' } + - { id: 8, class: intregs, preferred-register: '' } + - { id: 9, class: intregs, preferred-register: '' } + - { id: 10, class: intregs, preferred-register: '' } + - { id: 11, class: intregs, preferred-register: '' } + - { id: 12, class: predregs, preferred-register: '' } + - { id: 13, class: intregs, preferred-register: '' } + - { id: 14, class: intregs, preferred-register: '' } + - { id: 15, class: intregs, preferred-register: '' } + - { id: 16, class: predregs, preferred-register: '' } + - { id: 17, class: predregs, preferred-register: '' } + - { id: 18, class: predregs, preferred-register: '' } + - { id: 19, class: predregs, preferred-register: '' } + - { id: 20, class: intregs, preferred-register: '' } + - { id: 21, class: intregs, preferred-register: '' } +liveins: + - { reg: 
'$r0', virtual-reg: '%7' } + - { reg: '$r1', virtual-reg: '%8' } + - { reg: '$r2', virtual-reg: '%9' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.bb: + successors: %bb.4(0x30000000), %bb.5(0x50000000) + liveins: $r0, $r1, $r2 + + %9:intregs = COPY $r2 + %8:intregs = COPY $r1 + %7:intregs = COPY $r0 + %21:intregs = COPY %7 + %20:intregs = COPY %7 + %12:predregs = C2_cmpeqi %7, 0 + J2_jumpt %12, %bb.4, implicit-def $pc + + bb.5: + successors: %bb.1(0x80000000) + + %11:intregs = A2_tfrsi 0 + J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1 + + bb.1.bb3 (machine-block-address-taken): + successors: %bb.2(0x80000000) + + %0:intregs = PHI %11, %bb.5, %6, %bb.3 + %13:intregs = M2_mpyi %0, %7 + %1:intregs = S2_addasl_rrri %8, %13, 1 + J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + + bb.2.bb4 (machine-block-address-taken): + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %3:intregs = PHI %1, %bb.1, %5, %bb.2 + %14:intregs = L2_loadruh_io %3, 0 :: (load (s16) from %ir.phi5) + %15:intregs = A2_add %14, %9 + %5:intregs = S2_storerh_pi %3, 2, %15 :: (store (s16) into %ir.phi5) + ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.3, implicit-def dead $pc + + bb.3.bb10: + successors: %bb.4(0x04000000), %bb.1(0x7c000000) + + %6:intregs = nuw A2_addi %0, 1 + ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit 
$sa1, implicit $lc1 + J2_jump %bb.4, implicit-def dead $pc + + bb.4.bb13: + PS_jmpret $r31, implicit-def dead $pc + +... diff --git a/llvm/test/CodeGen/Hexagon/postincopt-crash.mir b/llvm/test/CodeGen/Hexagon/postincopt-crash.mir new file mode 100644 index 0000000000000..e22053421791d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/postincopt-crash.mir @@ -0,0 +1,58 @@ +# RUN: llc -march=hexagon -run-pass=hexagon-postincopt %s -o /dev/null +# REQUIRES: asserts +# Test that we do not hit unreachable code dealt with L4_ior_memoph_io. + +... +--- +name: foo +alignment: 4 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.4(0x30000000), %bb.5(0x50000000) + liveins: $r0, $r1, $r2 + + %9:intregs = COPY $r2 + %8:intregs = COPY $r1 + %7:intregs = COPY $r0 + %21:intregs = COPY %7 + %20:intregs = COPY %7 + %12:predregs = C2_cmpeqi %7, 0 + J2_jumpt %12, %bb.4, implicit-def $pc + + bb.5: + successors: %bb.1(0x80000000) + + %11:intregs = A2_tfrsi 0 + J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1 + + bb.1: + successors: %bb.2(0x80000000) + + %0:intregs = PHI %11, %bb.5, %6, %bb.3 + %13:intregs = M2_mpyi %0, %7 + %1:intregs = S2_addasl_rrri %8, %13, 1 + J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + + bb.2: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %3:intregs = PHI %1, %bb.1, %5, %bb.2 + %14:intregs = L2_loadruh_io %3, 0 + L4_ior_memoph_io %3:intregs, 0, 21 + %15:intregs = A2_add %14, %9 + %5:intregs = S2_storerh_pi %3, 2, %15 + ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.3, implicit-def dead $pc + + bb.3: + successors: %bb.4(0x04000000), %bb.1(0x7c000000) + + %6:intregs = nuw A2_addi %0, 1 + ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1 + J2_jump %bb.4, implicit-def dead $pc + + bb.4: + PS_jmpret $r31, implicit-def dead $pc + +... 
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir b/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir new file mode 100644 index 0000000000000..27d653c99f7b8 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir @@ -0,0 +1,19 @@ +# RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s +# Check that this doesn't crash. +# CHECK: Y2_dcfetchbo + +name: fred +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1 + %0:intregs = IMPLICIT_DEF + + bb.1: + successors: %bb.1 + + %1:intregs = PHI %0:intregs, %bb.0, %2:intregs, %bb.1 + Y2_dcfetchbo %1:intregs, 0 + %2:intregs = A2_addi %1:intregs, 1 + J2_jump %bb.1, implicit-def dead $pc +... diff --git a/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir b/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir new file mode 100644 index 0000000000000..fca42d547dfbc --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir @@ -0,0 +1,32 @@ +# RUN: llc -march=hexagon -run-pass hexagon-postincopt -o - %s | FileCheck %s +# REQUIRES: asserts + +# Check that this doesn't crash: +# CHECK: L2_loadbsw4_io + +--- +name: fred +tracksRegLiveness: true +liveins: + - { reg: '$r0', virtual-reg: '%0' } +body: | + bb.0: + successors: %bb.1(0x80000000) + liveins: $r0 + + %0:intregs = COPY $r0 + %1:intregs = A2_tfrsi 240 + %2:doubleregs = IMPLICIT_DEF + %3:doubleregs = IMPLICIT_DEF + + bb.1: + successors: %bb.1(0x80000000) + + %4:intregs = PHI %1, %bb.0, %5, %bb.1 + %6:doubleregs = L2_loadbsw4_io %4, 0 + %7:doubleregs = M2_vrmac_s0 %2, %6, %3 + S2_storeri_io %0, 0, %7.isub_lo + %5:intregs = nuw A2_addi %4, 256 + J2_jump %bb.1, implicit-def dead $pc + +... 
From a976e3c6959209f6f011260f64e4705ee84b47e8 Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:53:53 -0800 Subject: [PATCH 167/351] [compiler-rt][Fuchsia] Propogate raw_report to UnmapOrDieVmar (#82566) As of #77488, UnmapOrDie now accepts raw_report which allows the program to crash without calling Report(). We should propogate this value through UnmapOrDieVmar and have that call ReportMunmapFailureAndDie which uses `raw_report`. --- .../lib/sanitizer_common/sanitizer_fuchsia.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp index 2f291f7ca9ea1..a67b2a8725eca 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp @@ -288,7 +288,8 @@ uptr ReservedAddressRange::MapOrDie(uptr fixed_addr, uptr map_size, name ? name : name_, true); } -void UnmapOrDieVmar(void *addr, uptr size, zx_handle_t target_vmar) { +void UnmapOrDieVmar(void *addr, uptr size, zx_handle_t target_vmar, + bool raw_report) { if (!addr || !size) return; size = RoundUpTo(size, GetPageSize()); @@ -301,11 +302,8 @@ void UnmapOrDieVmar(void *addr, uptr size, zx_handle_t target_vmar) { status = _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast(addr), size); } - if (status != ZX_OK) { - Report("ERROR: %s failed to deallocate 0x%zx (%zd) bytes at address %p\n", - SanitizerToolName, size, size, addr); - CHECK("unable to unmap" && 0); - } + if (status != ZX_OK) + ReportMunmapFailureAndDie(addr, size, status, raw_report); DecreaseTotalMmap(size); } @@ -327,7 +325,8 @@ void ReservedAddressRange::Unmap(uptr addr, uptr size) { } // Partial unmapping does not affect the fact that the initial range is still // reserved, and the resulting unmapped memory can't be reused. 
- UnmapOrDieVmar(reinterpret_cast(addr), size, vmar); + UnmapOrDieVmar(reinterpret_cast(addr), size, vmar, + /*raw_report=*/false); } // This should never be called. @@ -413,8 +412,8 @@ void *MmapAlignedOrDieOnFatalError(uptr size, uptr alignment, return reinterpret_cast(addr); } -void UnmapOrDie(void *addr, uptr size, bool) { - UnmapOrDieVmar(addr, size, gSanitizerHeapVmar); +void UnmapOrDie(void *addr, uptr size, bool raw_report) { + UnmapOrDieVmar(addr, size, gSanitizerHeapVmar, raw_report); } void ReleaseMemoryPagesToOS(uptr beg, uptr end) { From ba31a195f5f2efc17bee8cf3be4260badc578615 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 22 Feb 2024 01:57:57 +0000 Subject: [PATCH 168/351] [gn build] Port 4c0fdcdb3307 --- llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn index c3cafe58a1adf..99bea15057873 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn @@ -57,6 +57,7 @@ static_library("LLVMHexagonCodeGen") { "HexagonFrameLowering.cpp", "HexagonGenExtract.cpp", "HexagonGenInsert.cpp", + "HexagonGenMemAbsolute.cpp", "HexagonGenMux.cpp", "HexagonGenPredicate.cpp", "HexagonHardwareLoops.cpp", From dd6d059da5a75689666e555058ade7a83e81d29f Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 22 Feb 2024 01:57:57 +0000 Subject: [PATCH 169/351] [gn build] Port d62ca8def395 --- llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn index 99bea15057873..09b5811d7d122 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn @@ -75,6 +75,7 @@ 
static_library("LLVMHexagonCodeGen") { "HexagonOptAddrMode.cpp", "HexagonOptimizeSZextends.cpp", "HexagonPeephole.cpp", + "HexagonPostIncOpt.cpp", "HexagonRDFOpt.cpp", "HexagonRegisterInfo.cpp", "HexagonSelectionDAGInfo.cpp", From 99822be6f08e42eef38913a128996a93e8292f73 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 13:59:31 -0700 Subject: [PATCH 170/351] Apply clang-tidy fixes for readability-identifier-naming in SerializationTest.cpp (NFC) --- mlir/unittests/Dialect/SPIRV/SerializationTest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp b/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp index 3a6bcbd999a57..9d2f690ed898a 100644 --- a/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp +++ b/mlir/unittests/Dialect/SPIRV/SerializationTest.cpp @@ -77,7 +77,7 @@ class SerializationTest : public ::testing::Test { } // Inserts an Integer or a Vector of Integers constant of value 'val'. - spirv::ConstantOp AddConstInt(Type type, const APInt &val) { + spirv::ConstantOp addConstInt(Type type, const APInt &val) { OpBuilder builder(module->getRegion()); auto loc = UnknownLoc::get(&context); @@ -181,8 +181,8 @@ TEST_F(SerializationTest, SignlessVsSignedIntegerConstantBitExtension) { APInt signedIntConstVal(signedInt16Type.getWidth(), -1, signedInt16Type.getSignedness()); - AddConstInt(signlessInt16Type, signlessIntConstVal); - AddConstInt(signedInt16Type, signedIntConstVal); + addConstInt(signlessInt16Type, signlessIntConstVal); + addConstInt(signedInt16Type, signedIntConstVal); ASSERT_TRUE(succeeded(spirv::serialize(module.get(), binary))); auto hasSignlessVal = [&](spirv::Opcode opcode, ArrayRef operands) { From 443247993cb8562f1308aab5ee0a9404983707d0 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 14:09:53 -0700 Subject: [PATCH 171/351] Apply clang-tidy fixes for llvm-qualified-auto in InterfaceAttachmentTest.cpp (NFC) --- 
mlir/unittests/IR/InterfaceAttachmentTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/unittests/IR/InterfaceAttachmentTest.cpp b/mlir/unittests/IR/InterfaceAttachmentTest.cpp index 2e1309ad776fe..16de34c45ec6e 100644 --- a/mlir/unittests/IR/InterfaceAttachmentTest.cpp +++ b/mlir/unittests/IR/InterfaceAttachmentTest.cpp @@ -421,7 +421,7 @@ TEST(InterfaceAttachmentTest, PromisedInterfaces) { // Attribute interfaces use the exact same mechanism as types, so just check // that the promise mechanism works for attributes. MLIRContext context; - auto testDialect = context.getOrLoadDialect(); + auto *testDialect = context.getOrLoadDialect(); auto attr = test::SimpleAAttr::get(&context); // `SimpleAAttr` doesn't implement nor promises the From df8d5c17802b162c5d20300426f03d6fb970d2a2 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 14:12:49 -0700 Subject: [PATCH 172/351] Apply clang-tidy fixes for llvm-qualified-auto in OperationSupportTest.cpp (NFC) --- mlir/unittests/IR/OperationSupportTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/unittests/IR/OperationSupportTest.cpp b/mlir/unittests/IR/OperationSupportTest.cpp index 8a4f67b926274..9d75615b39c0c 100644 --- a/mlir/unittests/IR/OperationSupportTest.cpp +++ b/mlir/unittests/IR/OperationSupportTest.cpp @@ -295,9 +295,9 @@ TEST(OperationEquivalenceTest, HashWorksWithFlags) { MLIRContext context; context.getOrLoadDialect(); - auto op1 = createOp(&context); + auto *op1 = createOp(&context); // `op1` has an unknown loc. 
- auto op2 = createOp(&context); + auto *op2 = createOp(&context); op2->setLoc(NameLoc::get(StringAttr::get(&context, "foo"))); auto getHash = [](Operation *op, OperationEquivalence::Flags flags) { return OperationEquivalence::computeHash( From fa25433d433932b1b8fd296206b1bcd974afecad Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 14:26:39 -0700 Subject: [PATCH 173/351] Apply clang-tidy fixes for modernize-use-override in SerializeNVVMTarget.cpp (NFC) --- mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp index a00ebba7b97e6..924708f99902f 100644 --- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp +++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp @@ -37,7 +37,7 @@ using namespace mlir; class MLIRTargetLLVMNVVM : public ::testing::Test { protected: - virtual void SetUp() { + void SetUp() override { registerBuiltinDialectTranslation(registry); registerLLVMDialectTranslation(registry); registerGPUDialectTranslation(registry); From 0d12628d06b8ab37157faea474548735ddb7eeb2 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 14:27:24 -0700 Subject: [PATCH 174/351] Apply clang-tidy fixes for readability-container-size-empty in SerializeNVVMTarget.cpp (NFC) --- mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp index 924708f99902f..26bfbd5c11e81 100644 --- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp +++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp @@ -85,7 +85,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMMToLLVM)) { serializer.serializeToObject(gpuModule, options); // Check that the serializer was successful. 
ASSERT_TRUE(object != std::nullopt); - ASSERT_TRUE(object->size() > 0); + ASSERT_TRUE(!object->empty()); // Read the serialized module. llvm::MemoryBufferRef buffer(StringRef(object->data(), object->size()), @@ -121,7 +121,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToPTX)) { serializer.serializeToObject(gpuModule, options); // Check that the serializer was successful. ASSERT_TRUE(object != std::nullopt); - ASSERT_TRUE(object->size() > 0); + ASSERT_TRUE(!object->empty()); ASSERT_TRUE( StringRef(object->data(), object->size()).contains("nvvm_kernel")); @@ -151,6 +151,6 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToBinary)) { serializer.serializeToObject(gpuModule, options); // Check that the serializer was successful. ASSERT_TRUE(object != std::nullopt); - ASSERT_TRUE(object->size() > 0); + ASSERT_TRUE(!object->empty()); } } From 1eeeab82c6eb185f5139e633a59c2dbcb15616e4 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 21 Feb 2024 20:39:02 -0600 Subject: [PATCH 175/351] [lldb][test] Modernize assertEqual(value, bool) (#82526) Any time we see the pattern `assertEqual(value, bool)`, we can replace that with `assert(value)`. Likewise for `assertNotEqual`. Technically this relaxes the test a bit, as we may want to make sure `value` is either `True` or `False`, and not something that implicitly converts to a bool. For example, `assertEqual("foo", True)` will fail, but `assertTrue("foo")` will not. In most cases, this distinction is not important. There are two such places that this patch does **not** transform, since it seems intentional that we want the result to be a bool: * https://github.com/llvm/llvm-project/blob/5daf2001a1e4d71ce1273a1e7e31cf6e6ac37c10/lldb/test/API/python_api/sbstructureddata/TestStructuredDataAPI.py#L90 * https://github.com/llvm/llvm-project/blob/5daf2001a1e4d71ce1273a1e7e31cf6e6ac37c10/lldb/test/API/commands/settings/TestSettings.py#L940 Followup to 9c2468821ec51defd09c246fea4a47886fff8c01. 
I patched `teyit` with a `visit_assertEqual` node handler to generate this. --- .../call-throws/TestCallThatThrows.py | 2 +- .../expression/dont_allow_jit/TestAllowJIT.py | 8 ++-- .../commands/statistics/basic/TestStats.py | 34 ++++++------- lldb/test/API/commands/trace/TestTraceSave.py | 8 ++-- .../TestBadAddressBreakpoints.py | 2 +- .../TestBreakpointCommand.py | 12 ++--- .../breakpoint_names/TestBreakpointNames.py | 20 +++----- .../TestJLink6Armv7RegisterDefinition.py | 2 +- .../simple_exe/TestModuleCacheSimple.py | 13 ++--- .../stats_api/TestStatisticsAPI.py | 48 +++++++++---------- .../backtrace_limit/TestBacktraceLimit.py | 2 +- .../TestArmMachoCorefileRegctx.py | 4 +- .../addrable-bits/TestAddrableBitsCorefile.py | 2 +- .../TestFirmwareCorefiles.py | 14 +++--- .../kern-ver-str/TestKernVerStrLCNOTE.py | 2 +- .../TestMultipleBinaryCorefile.py | 2 +- lldb/test/API/macosx/queues/TestQueues.py | 3 +- .../safe-to-func-call/TestSafeFuncCalls.py | 3 +- .../TestRunCommandInterpreterAPI.py | 28 +++++------ 19 files changed, 95 insertions(+), 114 deletions(-) diff --git a/lldb/test/API/commands/expression/call-throws/TestCallThatThrows.py b/lldb/test/API/commands/expression/call-throws/TestCallThatThrows.py index 2868ec5ffdbdf..b8cc87c93ba61 100644 --- a/lldb/test/API/commands/expression/call-throws/TestCallThatThrows.py +++ b/lldb/test/API/commands/expression/call-throws/TestCallThatThrows.py @@ -46,7 +46,7 @@ def call_function(self): value = frame.EvaluateExpression("[my_class callMeIThrow]", options) self.assertTrue(value.IsValid()) - self.assertEqual(value.GetError().Success(), False) + self.assertFalse(value.GetError().Success()) self.check_after_call() diff --git a/lldb/test/API/commands/expression/dont_allow_jit/TestAllowJIT.py b/lldb/test/API/commands/expression/dont_allow_jit/TestAllowJIT.py index 307d4521427dc..eb812f1902e66 100644 --- a/lldb/test/API/commands/expression/dont_allow_jit/TestAllowJIT.py +++ 
b/lldb/test/API/commands/expression/dont_allow_jit/TestAllowJIT.py @@ -54,7 +54,7 @@ def expr_options_test(self): # First make sure we can call the function with the default option set. options = lldb.SBExpressionOptions() # Check that the default is to allow JIT: - self.assertEqual(options.GetAllowJIT(), True, "Default is true") + self.assertTrue(options.GetAllowJIT(), "Default is true") # Now use the options: result = frame.EvaluateExpression("call_me(10)", options) @@ -64,9 +64,7 @@ def expr_options_test(self): # Now disallow JIT and make sure it fails: options.SetAllowJIT(False) # Check that we got the right value: - self.assertEqual( - options.GetAllowJIT(), False, "Got False after setting to False" - ) + self.assertFalse(options.GetAllowJIT(), "Got False after setting to False") # Again use it and ensure we fail: result = frame.EvaluateExpression("call_me(10)", options) @@ -79,7 +77,7 @@ def expr_options_test(self): # Finally set the allow JIT value back to true and make sure that works: options.SetAllowJIT(True) - self.assertEqual(options.GetAllowJIT(), True, "Set back to True correctly") + self.assertTrue(options.GetAllowJIT(), "Set back to True correctly") # And again, make sure this works: result = frame.EvaluateExpression("call_me(10)", options) diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py index 6f083222227fb..fb6fc07d2d443 100644 --- a/lldb/test/API/commands/statistics/basic/TestStats.py +++ b/lldb/test/API/commands/statistics/basic/TestStats.py @@ -35,17 +35,13 @@ def test_enable_disable(self): ) def verify_key_in_dict(self, key, d, description): - self.assertEqual( - key in d, - True, - 'make sure key "%s" is in dictionary %s' % (key, description), + self.assertIn( + key, d, 'make sure key "%s" is in dictionary %s' % (key, description) ) def verify_key_not_in_dict(self, key, d, description): - self.assertEqual( - key in d, - False, - 'make sure key "%s" is in dictionary %s' % 
(key, description), + self.assertNotIn( + key, d, 'make sure key "%s" is in dictionary %s' % (key, description) ) def verify_keys(self, dict, description, keys_exist, keys_missing=None): @@ -120,9 +116,7 @@ def test_expressions_frame_var_counts(self): self.verify_success_fail_count(stats, "frameVariable", 1, 0) # Test that "stopCount" is available when the process has run - self.assertEqual( - "stopCount" in stats, True, 'ensure "stopCount" is in target JSON' - ) + self.assertIn("stopCount", stats, 'ensure "stopCount" is in target JSON') self.assertGreater( stats["stopCount"], 0, 'make sure "stopCount" is greater than zero' ) @@ -484,9 +478,9 @@ def test_dsym_binary_has_symfile_in_stats(self): exe = self.getBuildArtifact(exe_name) dsym = self.getBuildArtifact(exe_name + ".dSYM") # Make sure the executable file exists after building. - self.assertEqual(os.path.exists(exe), True) + self.assertTrue(os.path.exists(exe)) # Make sure the dSYM file exists after building. - self.assertEqual(os.path.isdir(dsym), True) + self.assertTrue(os.path.isdir(dsym)) # Create the target target = self.createTestTarget(file_path=exe) @@ -532,9 +526,9 @@ def test_no_dsym_binary_has_symfile_identifiers_in_stats(self): exe = self.getBuildArtifact(exe_name) dsym = self.getBuildArtifact(exe_name + ".dSYM") # Make sure the executable file exists after building. - self.assertEqual(os.path.exists(exe), True) + self.assertTrue(os.path.exists(exe)) # Make sure the dSYM file doesn't exist after building. - self.assertEqual(os.path.isdir(dsym), False) + self.assertFalse(os.path.isdir(dsym)) # Create the target target = self.createTestTarget(file_path=exe) @@ -585,11 +579,11 @@ def test_had_frame_variable_errors(self): dsym = self.getBuildArtifact(exe_name + ".dSYM") main_obj = self.getBuildArtifact("main.o") # Make sure the executable file exists after building. 
- self.assertEqual(os.path.exists(exe), True) + self.assertTrue(os.path.exists(exe)) # Make sure the dSYM file doesn't exist after building. - self.assertEqual(os.path.isdir(dsym), False) + self.assertFalse(os.path.isdir(dsym)) # Make sure the main.o object file exists after building. - self.assertEqual(os.path.exists(main_obj), True) + self.assertTrue(os.path.exists(main_obj)) # Delete the main.o file that contains the debug info so we force an # error when we run to main and try to get variables @@ -604,7 +598,7 @@ def test_had_frame_variable_errors(self): # Make sure we have "debugInfoHadVariableErrors" variable that is set to # false before failing to get local variables due to missing .o file. - self.assertEqual(exe_stats["debugInfoHadVariableErrors"], False) + self.assertFalse(exe_stats["debugInfoHadVariableErrors"]) # Verify that the top level statistic that aggregates the number of # modules with debugInfoHadVariableErrors is zero @@ -624,7 +618,7 @@ def test_had_frame_variable_errors(self): # Make sure we have "hadFrameVariableErrors" variable that is set to # true after failing to get local variables due to missing .o file. 
- self.assertEqual(exe_stats["debugInfoHadVariableErrors"], True) + self.assertTrue(exe_stats["debugInfoHadVariableErrors"]) # Verify that the top level statistic that aggregates the number of # modules with debugInfoHadVariableErrors is greater than zero diff --git a/lldb/test/API/commands/trace/TestTraceSave.py b/lldb/test/API/commands/trace/TestTraceSave.py index ef1ab2f7aa41c..af38669cb4fce 100644 --- a/lldb/test/API/commands/trace/TestTraceSave.py +++ b/lldb/test/API/commands/trace/TestTraceSave.py @@ -179,11 +179,11 @@ def testSaveTrace(self): res = lldb.SBCommandReturnObject() ci.HandleCommand("thread trace dump instructions -c 10 --forwards", res) - self.assertEqual(res.Succeeded(), True) + self.assertTrue(res.Succeeded()) first_ten_instructions = res.GetOutput() ci.HandleCommand("thread trace dump instructions -c 10", res) - self.assertEqual(res.Succeeded(), True) + self.assertTrue(res.Succeeded()) last_ten_instructions = res.GetOutput() # Now, save the trace to @@ -203,11 +203,11 @@ def testSaveTrace(self): # Compare with instructions saved at the first time ci.HandleCommand("thread trace dump instructions -c 10 --forwards", res) - self.assertEqual(res.Succeeded(), True) + self.assertTrue(res.Succeeded()) self.assertEqual(res.GetOutput(), first_ten_instructions) ci.HandleCommand("thread trace dump instructions -c 10", res) - self.assertEqual(res.Succeeded(), True) + self.assertTrue(res.Succeeded()) self.assertEqual(res.GetOutput(), last_ten_instructions) def testSaveKernelTrace(self): diff --git a/lldb/test/API/functionalities/breakpoint/address_breakpoints/TestBadAddressBreakpoints.py b/lldb/test/API/functionalities/breakpoint/address_breakpoints/TestBadAddressBreakpoints.py index 0ab11a427c100..d120692e4d6e6 100644 --- a/lldb/test/API/functionalities/breakpoint/address_breakpoints/TestBadAddressBreakpoints.py +++ b/lldb/test/API/functionalities/breakpoint/address_breakpoints/TestBadAddressBreakpoints.py @@ -40,7 +40,7 @@ def address_breakpoints(self): 
bkpt = target.BreakpointCreateByAddress(illegal_address) # Verify that breakpoint is not resolved. for bp_loc in bkpt: - self.assertEqual(bp_loc.IsResolved(), False) + self.assertFalse(bp_loc.IsResolved()) else: self.fail( "Could not find an illegal address at which to set a bad breakpoint." diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py index 620f648d51fd2..ea242952e409e 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py @@ -572,9 +572,9 @@ def verify_source_map_deduce_statistics(self, target, expected_count): res = target.GetStatistics().GetAsJSON(stream) self.assertTrue(res.Success()) debug_stats = json.loads(stream.GetData()) - self.assertEqual( - "targets" in debug_stats, - True, + self.assertIn( + "targets", + debug_stats, 'Make sure the "targets" key in in target.GetStatistics()', ) target_stats = debug_stats["targets"][0] @@ -659,9 +659,9 @@ def test_breakpoint_statistics_hitcount(self): res = target.GetStatistics().GetAsJSON(stream) self.assertTrue(res.Success()) debug_stats = json.loads(stream.GetData()) - self.assertEqual( - "targets" in debug_stats, - True, + self.assertIn( + "targets", + debug_stats, 'Make sure the "targets" key in in target.GetStatistics()', ) target_stats = debug_stats["targets"][0] diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_names/TestBreakpointNames.py b/lldb/test/API/functionalities/breakpoint/breakpoint_names/TestBreakpointNames.py index 330f916a907e6..0f9510c4507d0 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_names/TestBreakpointNames.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_names/TestBreakpointNames.py @@ -389,7 +389,7 @@ def do_check_configuring_names(self): ) def check_permission_results(self, 
bp_name): - self.assertEqual(bp_name.GetAllowDelete(), False, "Didn't set allow delete.") + self.assertFalse(bp_name.GetAllowDelete(), "Didn't set allow delete.") protected_bkpt = self.target.BreakpointCreateByLocation(self.main_file_spec, 10) protected_id = protected_bkpt.GetID() @@ -402,14 +402,11 @@ def check_permission_results(self, bp_name): self.assertSuccess(success, "Couldn't add this name to the breakpoint") self.target.DisableAllBreakpoints() - self.assertEqual( - protected_bkpt.IsEnabled(), - True, - "Didnt' keep breakpoint from being disabled", + self.assertTrue( + protected_bkpt.IsEnabled(), "Didnt' keep breakpoint from being disabled" ) - self.assertEqual( + self.assertFalse( unprotected_bkpt.IsEnabled(), - False, "Protected too many breakpoints from disabling.", ) @@ -418,14 +415,11 @@ def check_permission_results(self, bp_name): result = lldb.SBCommandReturnObject() self.dbg.GetCommandInterpreter().HandleCommand("break disable", result) self.assertTrue(result.Succeeded()) - self.assertEqual( - protected_bkpt.IsEnabled(), - True, - "Didnt' keep breakpoint from being disabled", + self.assertTrue( + protected_bkpt.IsEnabled(), "Didnt' keep breakpoint from being disabled" ) - self.assertEqual( + self.assertFalse( unprotected_bkpt.IsEnabled(), - False, "Protected too many breakpoints from disabling.", ) diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py b/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py index eb7c036c8d600..3a426620af559 100644 --- a/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py @@ -198,7 +198,7 @@ def QListThreadsInStopReply(self): error = lldb.SBError() data = lldb.SBData() data.SetData(error, val, lldb.eByteOrderBig, 4) - self.assertEqual(r1_valobj.SetData(data, error), True) + self.assertTrue(r1_valobj.SetData(data, error)) 
self.assertSuccess(error) r1_valobj = process.GetThreadAtIndex(0).GetFrameAtIndex(0).FindRegister("r1") diff --git a/lldb/test/API/functionalities/module_cache/simple_exe/TestModuleCacheSimple.py b/lldb/test/API/functionalities/module_cache/simple_exe/TestModuleCacheSimple.py index 4214bd108bfc5..abf4cf3944e14 100644 --- a/lldb/test/API/functionalities/module_cache/simple_exe/TestModuleCacheSimple.py +++ b/lldb/test/API/functionalities/module_cache/simple_exe/TestModuleCacheSimple.py @@ -66,18 +66,16 @@ def test(self): # get a different creation and modification time for the file since some # OSs store the modification time in seconds since Jan 1, 1970. os.remove(exe) - self.assertEqual( - os.path.exists(exe), - False, - "make sure we were able to remove the executable", + self.assertFalse( + os.path.exists(exe), "make sure we were able to remove the executable" ) time.sleep(2) # Now rebuild the binary so it has a different content which should # update the UUID to make the cache miss when it tries to load the # symbol table from the binary at the same path. self.build(dictionary={"CFLAGS_EXTRAS": "-DEXTRA_FUNCTION"}) - self.assertEqual( - os.path.exists(exe), True, "make sure executable exists after rebuild" + self.assertTrue( + os.path.exists(exe), "make sure executable exists after rebuild" ) # Make sure the modification time has changed or this test will fail. 
exe_mtime_2 = os.path.getmtime(exe) @@ -99,9 +97,8 @@ def test(self): main_module = target.GetModuleAtIndex(0) self.assertTrue(main_module.IsValid()) main_module.GetNumSymbols() - self.assertEqual( + self.assertTrue( os.path.exists(symtab_cache_path), - True, 'make sure "symtab" cache files exists after cache is updated', ) symtab_mtime_2 = os.path.getmtime(symtab_cache_path) diff --git a/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py b/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py index eee91bfadead9..851097bdfecf2 100644 --- a/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py +++ b/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py @@ -33,47 +33,47 @@ def test_stats_api(self): stream = lldb.SBStream() res = stats.GetAsJSON(stream) debug_stats = json.loads(stream.GetData()) - self.assertEqual( - "targets" in debug_stats, - True, + self.assertIn( + "targets", + debug_stats, 'Make sure the "targets" key in in target.GetStatistics()', ) - self.assertEqual( - "modules" in debug_stats, - True, + self.assertIn( + "modules", + debug_stats, 'Make sure the "modules" key in in target.GetStatistics()', ) stats_json = debug_stats["targets"][0] - self.assertEqual( - "expressionEvaluation" in stats_json, - True, + self.assertIn( + "expressionEvaluation", + stats_json, 'Make sure the "expressionEvaluation" key in in target.GetStatistics()["targets"][0]', ) - self.assertEqual( - "frameVariable" in stats_json, - True, + self.assertIn( + "frameVariable", + stats_json, 'Make sure the "frameVariable" key in in target.GetStatistics()["targets"][0]', ) expressionEvaluation = stats_json["expressionEvaluation"] - self.assertEqual( - "successes" in expressionEvaluation, - True, + self.assertIn( + "successes", + expressionEvaluation, 'Make sure the "successes" key in in "expressionEvaluation" dictionary"', ) - self.assertEqual( - "failures" in expressionEvaluation, - True, + self.assertIn( + "failures", + expressionEvaluation, 'Make sure 
the "failures" key in in "expressionEvaluation" dictionary"', ) frameVariable = stats_json["frameVariable"] - self.assertEqual( - "successes" in frameVariable, - True, + self.assertIn( + "successes", + frameVariable, 'Make sure the "successes" key in in "frameVariable" dictionary"', ) - self.assertEqual( - "failures" in frameVariable, - True, + self.assertIn( + "failures", + frameVariable, 'Make sure the "failures" key in in "frameVariable" dictionary"', ) diff --git a/lldb/test/API/functionalities/thread/backtrace_limit/TestBacktraceLimit.py b/lldb/test/API/functionalities/thread/backtrace_limit/TestBacktraceLimit.py index 98baea45ce894..fded504f7c612 100644 --- a/lldb/test/API/functionalities/thread/backtrace_limit/TestBacktraceLimit.py +++ b/lldb/test/API/functionalities/thread/backtrace_limit/TestBacktraceLimit.py @@ -23,5 +23,5 @@ def test_backtrace_depth(self): interp.HandleCommand( "settings set target.process.thread.max-backtrace-depth 30", result ) - self.assertEqual(True, result.Succeeded()) + self.assertTrue(result.Succeeded()) self.assertEqual(30, thread.GetNumFrames()) diff --git a/lldb/test/API/macosx/arm-corefile-regctx/TestArmMachoCorefileRegctx.py b/lldb/test/API/macosx/arm-corefile-regctx/TestArmMachoCorefileRegctx.py index 1ecb0f466e78d..4190ea3ac3318 100644 --- a/lldb/test/API/macosx/arm-corefile-regctx/TestArmMachoCorefileRegctx.py +++ b/lldb/test/API/macosx/arm-corefile-regctx/TestArmMachoCorefileRegctx.py @@ -28,7 +28,7 @@ def test_armv7_corefile(self): target = self.dbg.CreateTarget("") err = lldb.SBError() process = target.LoadCore(self.corefile) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) thread = process.GetSelectedThread() frame = thread.GetSelectedFrame() @@ -51,7 +51,7 @@ def test_arm64_corefile(self): target = self.dbg.CreateTarget("") err = lldb.SBError() process = target.LoadCore(self.corefile) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) thread = 
process.GetSelectedThread() frame = thread.GetSelectedFrame() diff --git a/lldb/test/API/macosx/lc-note/addrable-bits/TestAddrableBitsCorefile.py b/lldb/test/API/macosx/lc-note/addrable-bits/TestAddrableBitsCorefile.py index 221fe6260088e..e56ecfcb14d4b 100644 --- a/lldb/test/API/macosx/lc-note/addrable-bits/TestAddrableBitsCorefile.py +++ b/lldb/test/API/macosx/lc-note/addrable-bits/TestAddrableBitsCorefile.py @@ -29,7 +29,7 @@ def test_lc_note_addrable_bits(self): (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( self, "break here", lldb.SBFileSpec("main.c") ) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) found_main = False for f in thread.frames: diff --git a/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py b/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py index b9d2055e83a56..db3074d7e7942 100644 --- a/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py +++ b/lldb/test/API/macosx/lc-note/firmware-corefile/TestFirmwareCorefiles.py @@ -73,7 +73,7 @@ def test_lc_note_version_string(self): if self.TraceOn(): self.runCmd("script print('loading corefile %s')" % verstr_corefile) process = target.LoadCore(verstr_corefile) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) if self.TraceOn(): self.runCmd("image list") self.runCmd("target mod dump sections") @@ -91,7 +91,7 @@ def test_lc_note_version_string(self): "script print('loading corefile %s')" % verstr_corefile_invalid_ident ) process = target.LoadCore(verstr_corefile_invalid_ident) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) # Third, try the "kern ver str" corefile where it loads at an address target = self.dbg.CreateTarget("") @@ -99,7 +99,7 @@ def test_lc_note_version_string(self): if self.TraceOn(): self.runCmd("script print('loading corefile %s')" % verstr_corefile_addr) process = target.LoadCore(verstr_corefile_addr) - 
self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) if self.TraceOn(): self.runCmd("image list") self.runCmd("target mod dump sections") @@ -178,7 +178,7 @@ def test_lc_note_main_bin_spec(self): if self.TraceOn(): self.runCmd("script print('loading corefile %s')" % binspec_corefile) process = target.LoadCore(binspec_corefile) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) if self.TraceOn(): self.runCmd("image list") self.runCmd("target mod dump sections") @@ -192,7 +192,7 @@ def test_lc_note_main_bin_spec(self): if self.TraceOn(): self.runCmd("script print('loading corefile %s')" % binspec_corefile_addr) process = target.LoadCore(binspec_corefile_addr) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) if self.TraceOn(): self.runCmd("image list") self.runCmd("target mod dump sections") @@ -212,7 +212,7 @@ def test_lc_note_main_bin_spec(self): "script print('loading corefile %s')" % binspec_corefile_slideonly ) process = target.LoadCore(binspec_corefile_slideonly) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) if self.TraceOn(): self.runCmd("image list") self.runCmd("target mod dump sections") @@ -352,7 +352,7 @@ def test_lc_note_main_bin_spec_os_plugin(self): ) process = target.LoadCore(binspec_corefile_addr) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) if self.TraceOn(): self.runCmd("image list") self.runCmd("target mod dump sections") diff --git a/lldb/test/API/macosx/lc-note/kern-ver-str/TestKernVerStrLCNOTE.py b/lldb/test/API/macosx/lc-note/kern-ver-str/TestKernVerStrLCNOTE.py index 9713c4a85cd62..d4366196c53c3 100644 --- a/lldb/test/API/macosx/lc-note/kern-ver-str/TestKernVerStrLCNOTE.py +++ b/lldb/test/API/macosx/lc-note/kern-ver-str/TestKernVerStrLCNOTE.py @@ -94,7 +94,7 @@ def test_lc_note(self): self.target = self.dbg.CreateTarget("") err = lldb.SBError() self.process = 
self.target.LoadCore(self.corefile) - self.assertEqual(self.process.IsValid(), True) + self.assertTrue(self.process.IsValid()) if self.TraceOn(): self.runCmd("image list") self.assertEqual(self.target.GetNumModules(), 1) diff --git a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/TestMultipleBinaryCorefile.py b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/TestMultipleBinaryCorefile.py index 0a0bc68646e62..897eab23e05e2 100644 --- a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/TestMultipleBinaryCorefile.py +++ b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/TestMultipleBinaryCorefile.py @@ -45,7 +45,7 @@ def load_corefile_and_test(self): if self.TraceOn(): print("loading corefile %s" % self.corefile) process = target.LoadCore(self.corefile) - self.assertEqual(process.IsValid(), True) + self.assertTrue(process.IsValid()) if self.TraceOn(): print("image list after loading corefile:") self.runCmd("image list") diff --git a/lldb/test/API/macosx/queues/TestQueues.py b/lldb/test/API/macosx/queues/TestQueues.py index f2d15bb5ff15c..45b52af2fbdba 100644 --- a/lldb/test/API/macosx/queues/TestQueues.py +++ b/lldb/test/API/macosx/queues/TestQueues.py @@ -457,9 +457,8 @@ def queues_with_libBacktraceRecording(self): "doing_the_work_2", "queue 2's pending item #0 should be doing_the_work_2", ) - self.assertEqual( + self.assertFalse( queue_performer_2.GetPendingItemAtIndex(9999).IsValid(), - False, "queue 2's pending item #9999 is invalid", ) diff --git a/lldb/test/API/macosx/safe-to-func-call/TestSafeFuncCalls.py b/lldb/test/API/macosx/safe-to-func-call/TestSafeFuncCalls.py index 6a37b250b162d..551cab1269c51 100644 --- a/lldb/test/API/macosx/safe-to-func-call/TestSafeFuncCalls.py +++ b/lldb/test/API/macosx/safe-to-func-call/TestSafeFuncCalls.py @@ -49,8 +49,7 @@ def test_with_python_api(self): main_thread.SafeToCallFunctions(), "It is safe to call functions on the main thread", ) - self.assertEqual( + self.assertFalse( 
select_thread.SafeToCallFunctions(), - False, "It is not safe to call functions on the select thread", ) diff --git a/lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py b/lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py index 64e0770dc0b1c..af97493133766 100644 --- a/lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py +++ b/lldb/test/API/python_api/interpreter/TestRunCommandInterpreterAPI.py @@ -79,13 +79,13 @@ def test_command_interpreter_run_options(self): opts = lldb.SBCommandInterpreterRunOptions() # Check getters with default values - self.assertEqual(opts.GetStopOnContinue(), False) - self.assertEqual(opts.GetStopOnError(), False) - self.assertEqual(opts.GetStopOnCrash(), False) - self.assertEqual(opts.GetEchoCommands(), True) - self.assertEqual(opts.GetPrintResults(), True) - self.assertEqual(opts.GetPrintErrors(), True) - self.assertEqual(opts.GetAddToHistory(), True) + self.assertFalse(opts.GetStopOnContinue()) + self.assertFalse(opts.GetStopOnError()) + self.assertFalse(opts.GetStopOnCrash()) + self.assertTrue(opts.GetEchoCommands()) + self.assertTrue(opts.GetPrintResults()) + self.assertTrue(opts.GetPrintErrors()) + self.assertTrue(opts.GetAddToHistory()) # Invert values opts.SetStopOnContinue(not opts.GetStopOnContinue()) @@ -97,10 +97,10 @@ def test_command_interpreter_run_options(self): opts.SetAddToHistory(not opts.GetAddToHistory()) # Check the value changed - self.assertEqual(opts.GetStopOnContinue(), True) - self.assertEqual(opts.GetStopOnError(), True) - self.assertEqual(opts.GetStopOnCrash(), True) - self.assertEqual(opts.GetEchoCommands(), False) - self.assertEqual(opts.GetPrintResults(), False) - self.assertEqual(opts.GetPrintErrors(), False) - self.assertEqual(opts.GetAddToHistory(), False) + self.assertTrue(opts.GetStopOnContinue()) + self.assertTrue(opts.GetStopOnError()) + self.assertTrue(opts.GetStopOnCrash()) + self.assertFalse(opts.GetEchoCommands()) + 
self.assertFalse(opts.GetPrintResults()) + self.assertFalse(opts.GetPrintErrors()) + self.assertFalse(opts.GetAddToHistory()) From 11d115d0569b212dfeb7fe6485be48070e068e19 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 22 Feb 2024 11:05:06 +0800 Subject: [PATCH 176/351] [RISCV] Adjust test case to show wrong stride. NFC See https://github.com/llvm/llvm-project/pull/82506#discussion_r1498080785 --- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 1724b48dd6be9..60eec356773bf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -15093,24 +15093,24 @@ define <32 x i64> @mgather_strided_split(ptr %base) { define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ; RV32V-LABEL: masked_gather_widen_sew_negative_stride: ; RV32V: # %bb.0: -; RV32V-NEXT: addi a0, a0, -128 -; RV32V-NEXT: li a1, -128 +; RV32V-NEXT: addi a0, a0, -120 +; RV32V-NEXT: li a1, 120 ; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32V-NEXT: vlse64.v v8, (a0), a1 ; RV32V-NEXT: ret ; ; RV64V-LABEL: masked_gather_widen_sew_negative_stride: ; RV64V: # %bb.0: -; RV64V-NEXT: addi a0, a0, -128 -; RV64V-NEXT: li a1, -128 +; RV64V-NEXT: addi a0, a0, -120 +; RV64V-NEXT: li a1, 120 ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vlse64.v v8, (a0), a1 ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lui a1, 16392 -; RV32ZVE32F-NEXT: addi a1, a1, 1152 +; RV32ZVE32F-NEXT: lui a1, 16393 +; RV32ZVE32F-NEXT: addi a1, a1, -888 ; RV32ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.s.x v9, a1 ; RV32ZVE32F-NEXT: vluxei8.v v8, (a0), v9 @@ -15118,8 +15118,8 @@ define <4 x i32> 
@masked_gather_widen_sew_negative_stride(ptr %base) { ; ; RV64ZVE32F-LABEL: masked_gather_widen_sew_negative_stride: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 128 -; RV64ZVE32F-NEXT: lw a2, 132(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 136 +; RV64ZVE32F-NEXT: lw a2, 140(a0) ; RV64ZVE32F-NEXT: lw a3, 0(a0) ; RV64ZVE32F-NEXT: lw a0, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -15128,7 +15128,7 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: ret - %ptrs = getelementptr i32, ptr %base, <4 x i64> + %ptrs = getelementptr i32, ptr %base, <4 x i64> %x = call <4 x i32> @llvm.masked.gather.v4i32.v32p0(<4 x ptr> %ptrs, i32 8, <4 x i1> shufflevector(<4 x i1> insertelement(<4 x i1> poison, i1 true, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer), <4 x i32> poison) ret <4 x i32> %x } From 7e1432f1258e229a4fcc9c017937166f0578e1f8 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Wed, 21 Feb 2024 19:26:43 -0800 Subject: [PATCH 177/351] [lldb] Standardize command option parsing error messages (#82273) I have been looking to simplify parsing logic and improve the interfaces so that they are both easier to use and harder to abuse. To be specific, I am referring to functions such as `OptionArgParser::ToBoolean`: I would like to go from its current interface to something more like `llvm::Error ToBoolean(llvm::StringRef option_arg)`. Through working on that, I encountered 2 inconveniences: 1. Option parsing code is not uniform. Every function writes a slightly different error message, so incorporating an error message from the `ToBoolean` implementation is going to be laborious as I figure out what exactly needs to change or stay the same. 2. Changing the interface of `ToBoolean` would require a global atomic change across all of the Command code. 
This would be quite frustrating to do because of the non-uniformity of our existing code. To address these frustrations, I think it would be easiest to first standardize the error reporting mechanism when parsing options in commands. I do so by introducing `CreateOptionParsingError` which will create an error message of the shape: Invalid value ('${option_arg}') for -${short_value} ('${long_value}'): ${additional_context} Concretely, it would look something like this: (lldb) breakpoint set -n main -G yay error: Invalid value ('yay') for -G (auto-continue): Failed to parse as boolean After this, updating the interfaces for parsing the values themselves should become simpler. Because this can be adopted incrementally, this should be able to done over the course of time instead of all at once as a giant difficult-to-review change. I've changed exactly one function where this function would be used as an illustration of what I am proposing. --- lldb/include/lldb/Interpreter/Options.h | 33 +++++++++++++++++ .../Commands/CommandObjectBreakpoint.cpp | 37 ++++++++++--------- lldb/source/Interpreter/Options.cpp | 13 +++++++ lldb/unittests/Interpreter/CMakeLists.txt | 1 + lldb/unittests/Interpreter/TestOptions.cpp | 29 +++++++++++++++ 5 files changed, 96 insertions(+), 17 deletions(-) create mode 100644 lldb/unittests/Interpreter/TestOptions.cpp diff --git a/lldb/include/lldb/Interpreter/Options.h b/lldb/include/lldb/Interpreter/Options.h index bf74927cf99db..18a87e49deee5 100644 --- a/lldb/include/lldb/Interpreter/Options.h +++ b/lldb/include/lldb/Interpreter/Options.h @@ -336,6 +336,39 @@ class OptionGroupOptions : public Options { bool m_did_finalize = false; }; +/// Creates an error that represents the failure to parse an command line option +/// argument. This creates an error containing all information needed to show +/// the developer what went wrong when parsing their command. It is recommended +/// to use this instead of writing an error by hand. 
+/// +/// \param[in] option_arg +/// The argument that was attempted to be parsed. +/// +/// \param[in] short_option +/// The short form of the option. For example, if the flag is -f, the short +/// option is "f". +/// +/// \param[in] long_option +/// The long form of the option. This field is optional. If the flag is +/// --force, then the long option is "force". +/// +/// \param[in] additional_context +/// This is extra context that will get included in the error. This field is +/// optional. +/// +/// \return +/// An llvm::Error that contains a standardized format for what went wrong +/// when parsing and why. +llvm::Error CreateOptionParsingError(llvm::StringRef option_arg, + const char short_option, + llvm::StringRef long_option = {}, + llvm::StringRef additional_context = {}); + +static constexpr llvm::StringLiteral g_bool_parsing_error_message = + "Failed to parse as boolean"; +static constexpr llvm::StringLiteral g_int_parsing_error_message = + "Failed to parse as integer"; + } // namespace lldb_private #endif // LLDB_INTERPRETER_OPTIONS_H diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index 3fdf5cd3cd43d..fc2217608a0bb 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -64,6 +64,8 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { Status error; const int short_option = g_breakpoint_modify_options[option_idx].short_option; + const char *long_option = + g_breakpoint_modify_options[option_idx].long_option; switch (short_option) { case 'c': @@ -84,18 +86,17 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { case 'G': { bool value, success; value = OptionArgParser::ToBoolean(option_arg, false, &success); - if (success) { + if (success) m_bp_opts.SetAutoContinue(value); - } else - error.SetErrorStringWithFormat( - "invalid boolean value '%s' passed for -G option", - option_arg.str().c_str()); + 
else + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); } break; case 'i': { uint32_t ignore_count; if (option_arg.getAsInteger(0, ignore_count)) - error.SetErrorStringWithFormat("invalid ignore count '%s'", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message); else m_bp_opts.SetIgnoreCount(ignore_count); } break; @@ -105,27 +106,29 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { if (success) { m_bp_opts.SetOneShot(value); } else - error.SetErrorStringWithFormat( - "invalid boolean value '%s' passed for -o option", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); } break; case 't': { lldb::tid_t thread_id = LLDB_INVALID_THREAD_ID; if (option_arg == "current") { if (!execution_context) { - error.SetErrorStringWithFormat("No context to determine current " - "thread"); + error = CreateOptionParsingError( + option_arg, short_option, long_option, + "No context to determine current thread"); } else { ThreadSP ctx_thread_sp = execution_context->GetThreadSP(); if (!ctx_thread_sp || !ctx_thread_sp->IsValid()) { - error.SetErrorStringWithFormat("No currently selected thread"); + error = + CreateOptionParsingError(option_arg, short_option, long_option, + "No currently selected thread"); } else { thread_id = ctx_thread_sp->GetID(); } } } else if (option_arg.getAsInteger(0, thread_id)) { - error.SetErrorStringWithFormat("invalid thread id string '%s'", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message); } if (thread_id != LLDB_INVALID_THREAD_ID) m_bp_opts.SetThreadID(thread_id); @@ -139,8 +142,8 @@ class lldb_private::BreakpointOptionGroup : public OptionGroup { case 'x': { uint32_t thread_index = UINT32_MAX; if (option_arg.getAsInteger(0, thread_index)) { 
- error.SetErrorStringWithFormat("invalid thread index string '%s'", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message); } else { m_bp_opts.GetThreadSpec()->SetIndex(thread_index); } diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp index 89fe69009d903..51b7e6b26b6ef 100644 --- a/lldb/source/Interpreter/Options.cpp +++ b/lldb/source/Interpreter/Options.cpp @@ -1365,3 +1365,16 @@ llvm::Expected Options::Parse(const Args &args, argv.erase(argv.begin(), argv.begin() + OptionParser::GetOptionIndex()); return ReconstituteArgsAfterParsing(argv, args); } + +llvm::Error lldb_private::CreateOptionParsingError( + llvm::StringRef option_arg, const char short_option, + llvm::StringRef long_option, llvm::StringRef additional_context) { + std::string buffer; + llvm::raw_string_ostream stream(buffer); + stream << "Invalid value ('" << option_arg << "') for -" << short_option; + if (!long_option.empty()) + stream << " (" << long_option << ")"; + if (!additional_context.empty()) + stream << ": " << additional_context; + return llvm::createStringError(llvm::inconvertibleErrorCode(), buffer); +} diff --git a/lldb/unittests/Interpreter/CMakeLists.txt b/lldb/unittests/Interpreter/CMakeLists.txt index 5b5268ffe9732..54cea995084d3 100644 --- a/lldb/unittests/Interpreter/CMakeLists.txt +++ b/lldb/unittests/Interpreter/CMakeLists.txt @@ -2,6 +2,7 @@ add_lldb_unittest(InterpreterTests TestCommandPaths.cpp TestCompletion.cpp TestOptionArgParser.cpp + TestOptions.cpp TestOptionValue.cpp TestOptionValueFileColonLine.cpp TestRegexCommand.cpp diff --git a/lldb/unittests/Interpreter/TestOptions.cpp b/lldb/unittests/Interpreter/TestOptions.cpp new file mode 100644 index 0000000000000..93474e3c5713c --- /dev/null +++ b/lldb/unittests/Interpreter/TestOptions.cpp @@ -0,0 +1,29 @@ +//===-- TestOptions.cpp ---------------------------------------------------===// +// +// Part of the 
LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Interpreter/Options.h" +#include "gtest/gtest.h" + +#include "llvm/Testing/Support/Error.h" + +using namespace lldb_private; + +TEST(OptionsTest, CreateOptionParsingError) { + ASSERT_THAT_ERROR( + CreateOptionParsingError("yippee", 'f', "fun", + "unable to convert 'yippee' to boolean"), + llvm::FailedWithMessage("Invalid value ('yippee') for -f (fun): unable " + "to convert 'yippee' to boolean")); + + ASSERT_THAT_ERROR( + CreateOptionParsingError("52", 'b', "bean-count"), + llvm::FailedWithMessage("Invalid value ('52') for -b (bean-count)")); + + ASSERT_THAT_ERROR(CreateOptionParsingError("c", 'm'), + llvm::FailedWithMessage("Invalid value ('c') for -m")); +} From 05af9c83f3a0d154f73d619ac1361eae05531e5e Mon Sep 17 00:00:00 2001 From: Jason Eckhardt Date: Wed, 21 Feb 2024 21:36:10 -0600 Subject: [PATCH 178/351] [TableGen] Suppress per-HwMode duplicate instructions/tables. (#82567) Currently, for per-HwMode encoding/decoding, those instructions that do not have a HwMode override are duplicated into the decoder tables for all HwModes. This includes inducing multiple tables for instructions that are otherwise unrelated (e.g., different namespace with no overrides at all). This patch adds support to suppress instruction and table duplicates. TableGen option "-gen-disassembler --suppress-per-hwmode-duplicates" enables the suppression (off by default). For one downstream backend with a complicated ISA and major cross-generation encoding differences, this eliminates ~32000 duplicate table entries at the time of this patch. There are legitimate reasons to suppress or not suppress duplicates. 
If there are relatively few non-overridden related instructions, it can be convenient to pull them into the per-mode tables (only need to decode the per-mode tables, slightly simpler decode function in disassembler). On the other hand, in some backends, the opposite is true or the size is too large to tolerate any duplication in the first place. We let the user decide which makes sense. This is currently off by default, though there is no reason it couldn't be enabled by default. Any existing backends downstream using the per-HwMode feature will function as before. Turning on the feature requires minor modifications to their disassembler due to more/less tables and naming. --- llvm/test/TableGen/HwModeEncodeDecode2.td | 119 ++++++++++++++++++++ llvm/utils/TableGen/DecoderEmitter.cpp | 19 +++- llvm/utils/TableGen/DisassemblerEmitter.cpp | 2 + 3 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 llvm/test/TableGen/HwModeEncodeDecode2.td diff --git a/llvm/test/TableGen/HwModeEncodeDecode2.td b/llvm/test/TableGen/HwModeEncodeDecode2.td new file mode 100644 index 0000000000000..5159501d8148e --- /dev/null +++ b/llvm/test/TableGen/HwModeEncodeDecode2.td @@ -0,0 +1,119 @@ +// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | \ +// RUN: FileCheck %s --check-prefix=DECODER +// RUN: llvm-tblgen -gen-disassembler --suppress-per-hwmode-duplicates -I \ +// RUN: %p/../../include %s | FileCheck %s --check-prefix=DECODER-SUPPRESS + +// Test duplicate table suppression for per-HwMode decoders. 
+ +include "llvm/Target/Target.td" + +def archInstrInfo : InstrInfo { } + +def arch : Target { + let InstructionSet = archInstrInfo; +} + +def Myi32 : Operand { + let DecoderMethod = "DecodeMyi32"; +} + +def HasA : Predicate<"Subtarget->hasA()">; +def HasB : Predicate<"Subtarget->hasB()">; + +def ModeA : HwMode<"+a", [HasA]>; +def ModeB : HwMode<"+b", [HasB]>; + + +def fooTypeEncA : InstructionEncoding { + let Size = 4; + field bits<32> SoftFail = 0; + bits<32> Inst; + bits<8> factor; + let Inst{7...0} = factor; + let Inst{3...2} = 0b11; + let Inst{1...0} = 0b00; +} + +def fooTypeEncB : InstructionEncoding { + let Size = 4; + field bits<32> SoftFail = 0; + bits<32> Inst; + bits<8> factor; + let Inst{15...8} = factor; + let Inst{1...0} = 0b11; +} + +let OutOperandList = (outs) in { + def foo : Instruction { + let InOperandList = (ins i32imm:$factor); + let EncodingInfos = EncodingByHwMode< + [ModeA, ModeB], [fooTypeEncA, fooTypeEncB] + >; + let AsmString = "foo $factor"; + } + + // Encoding not overridden, same namespace: + // In the default case, this instruction is duplicated into both ModeA and + // ModeB decoder tables. + // In the suppressed case, this instruction appears in a single decoder table. + def bar: Instruction { + let InOperandList = (ins i32imm:$factor); + let Size = 4; + bits<32> Inst; + bits<32> SoftFail; + bits<8> factor; + let Inst{31...24} = factor; + let Inst{1...0} = 0b10; + let AsmString = "bar $factor"; + } + + def baz : Instruction { + let InOperandList = (ins i32imm:$factor); + bits<32> Inst; + let EncodingInfos = EncodingByHwMode< + [ModeB], [fooTypeEncA] + >; + let AsmString = "foo $factor"; + } + + // Encoding not overridden, different namespace: + // In the default case, this instruction is duplicated into two Alt decoder + // tables (ModeA and ModeB). + // In the suppressed case, this instruction appears in a single decoder table. 
+ def unrelated: Instruction { + let DecoderNamespace = "Alt"; + let InOperandList = (ins i32imm:$factor); + let Size = 4; + bits<32> Inst; + bits<32> SoftFail; + bits<8> factor; + let Inst{31...24} = factor; + let Inst{1...0} = 0b10; + let AsmString = "unrelated $factor"; + } +} + +// DECODER-LABEL: DecoderTableAlt_ModeA32[] = +// DECODER-DAG: Opcode: unrelated +// DECODER-LABEL: DecoderTableAlt_ModeB32[] = +// DECODER-DAG: Opcode: unrelated +// DECODER-LABEL: DecoderTable_ModeA32[] = +// DECODER-DAG: Opcode: fooTypeEncA:foo +// DECODER-DAG: Opcode: bar +// DECODER-LABEL: DecoderTable_ModeB32[] = +// DECODER-DAG: Opcode: fooTypeEncB:foo +// DECODER-DAG: Opcode: fooTypeEncA:baz +// DECODER-DAG: Opcode: bar + + +// DECODER-SUPPRESS-LABEL: DecoderTableAlt_AllModes32[] = +// DECODER-SUPPRESS-DAG: Opcode: unrelated +// DECODER-SUPPRESS-LABEL: DecoderTable_AllModes32[] = +// DECODER-SUPPRESS-DAG: Opcode: bar +// DECODER-SUPPRESS-LABEL: DecoderTable_ModeA32[] = +// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:foo +// DECODER-SUPPRESS-NOT: Opcode: bar +// DECODER-SUPPRESS-LABEL: DecoderTable_ModeB32[] = +// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncB:foo +// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:baz +// DECODER-SUPPRESS-NOT: Opcode: bar diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 22a71065134a4..36f437f02cf51 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCDecoderOps.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" @@ -50,6 +51,13 @@ using namespace llvm; #define DEBUG_TYPE "decoder-emitter" +extern cl::OptionCategory DisassemblerEmitterCat; + +cl::opt DecoderEmitterSuppressDuplicates( + "suppress-per-hwmode-duplicates", + cl::desc("Suppress duplication of instrs 
into per-HwMode decoder tables"), + cl::init(false), cl::cat(DisassemblerEmitterCat)); + namespace { STATISTIC(NumEncodings, "Number of encodings considered"); @@ -2496,10 +2504,15 @@ void DecoderEmitter::run(raw_ostream &o) { } } // This instruction is encoded the same on all HwModes. Emit it for all - // HwModes. - for (StringRef HwModeName : HwModeNames) + // HwModes by default, otherwise leave it in a single common table. + if (DecoderEmitterSuppressDuplicates) { NumberedEncodings.emplace_back(NumberedInstruction->TheDef, - NumberedInstruction, HwModeName); + NumberedInstruction, "AllModes"); + } else { + for (StringRef HwModeName : HwModeNames) + NumberedEncodings.emplace_back(NumberedInstruction->TheDef, + NumberedInstruction, HwModeName); + } } for (const auto &NumberedAlias : RK.getAllDerivedDefinitions("AdditionalEncoding")) diff --git a/llvm/utils/TableGen/DisassemblerEmitter.cpp b/llvm/utils/TableGen/DisassemblerEmitter.cpp index ae6a8ef22bc8c..2d653af4d3025 100644 --- a/llvm/utils/TableGen/DisassemblerEmitter.cpp +++ b/llvm/utils/TableGen/DisassemblerEmitter.cpp @@ -131,5 +131,7 @@ static void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) { EmitDecoder(Records, OS, PredicateNamespace); } +cl::OptionCategory DisassemblerEmitterCat("Options for -gen-disassembler"); + static TableGen::Emitter::Opt X("gen-disassembler", EmitDisassembler, "Generate disassembler"); From 815644b4dd882ade2e5649d4f97c3dd6f7aea200 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 22 Feb 2024 11:50:27 +0800 Subject: [PATCH 179/351] [RISCV] Fix mgather -> riscv.masked.strided.load combine not extending indices (#82506) This fixes the miscompile reported in #82430 by telling isSimpleVIDSequence to sign extend to XLen instead of the width of the indices, since the "sequence" of indices generated by a strided load will be at XLen. This was the simplest way I could think of getting isSimpleVIDSequence to treat the indexes as if they were zero extended to XLenVT. 
Another way we could do this is by refactoring out the "get constant integers" part from isSimpleVIDSequence and handle them as APInts so we can separately zero extend it. Fixes #82430 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 20 +++++++++++-------- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 12 ++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f7275eb7c77bb..75be97ff32bbe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3240,7 +3240,8 @@ static std::optional getExactInteger(const APFloat &APF, // Note that this method will also match potentially unappealing index // sequences, like , however it is left to the caller to // determine whether this is worth generating code for. -static std::optional isSimpleVIDSequence(SDValue Op) { +static std::optional isSimpleVIDSequence(SDValue Op, + unsigned EltSizeInBits) { unsigned NumElts = Op.getNumOperands(); assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR"); bool IsInteger = Op.getValueType().isInteger(); @@ -3248,7 +3249,7 @@ static std::optional isSimpleVIDSequence(SDValue Op) { std::optional SeqStepDenom; std::optional SeqStepNum, SeqAddend; std::optional> PrevElt; - unsigned EltSizeInBits = Op.getValueType().getScalarSizeInBits(); + assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits()); for (unsigned Idx = 0; Idx < NumElts; Idx++) { // Assume undef elements match the sequence; we just have to be careful // when interpolating across them. @@ -3261,14 +3262,14 @@ static std::optional isSimpleVIDSequence(SDValue Op) { if (!isa(Op.getOperand(Idx))) return std::nullopt; Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(EltSizeInBits); + maskTrailingOnes(Op.getScalarValueSizeInBits()); } else { // The BUILD_VECTOR must be all constants. 
if (!isa(Op.getOperand(Idx))) return std::nullopt; if (auto ExactInteger = getExactInteger( cast(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits)) + Op.getScalarValueSizeInBits())) Val = *ExactInteger; else return std::nullopt; @@ -3324,11 +3325,11 @@ static std::optional isSimpleVIDSequence(SDValue Op) { uint64_t Val; if (IsInteger) { Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(EltSizeInBits); + maskTrailingOnes(Op.getScalarValueSizeInBits()); } else { Val = *getExactInteger( cast(Op.getOperand(Idx))->getValueAPF(), - EltSizeInBits); + Op.getScalarValueSizeInBits()); } uint64_t ExpectedVal = (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom; @@ -3598,7 +3599,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, // Try and match index sequences, which we can lower to the vid instruction // with optional modifications. An all-undef vector is matched by // getSplatValue, above. - if (auto SimpleVID = isSimpleVIDSequence(Op)) { + if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) { int64_t StepNumerator = SimpleVID->StepNumerator; unsigned StepDenominator = SimpleVID->StepDenominator; int64_t Addend = SimpleVID->Addend; @@ -15978,7 +15979,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (Index.getOpcode() == ISD::BUILD_VECTOR && MGN->getExtensionType() == ISD::NON_EXTLOAD && isTypeLegal(VT)) { - if (std::optional SimpleVID = isSimpleVIDSequence(Index); + // The sequence will be XLenVT, not the type of Index. Tell + // isSimpleVIDSequence this so we avoid overflow. 
+ if (std::optional SimpleVID = + isSimpleVIDSequence(Index, Subtarget.getXLen()); SimpleVID && SimpleVID->StepDenominator == 1) { const int64_t StepNumerator = SimpleVID->StepNumerator; const int64_t Addend = SimpleVID->Addend; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 60eec356773bf..88c299a19fb4e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -15086,23 +15086,19 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ret <32 x i64> %x } -; FIXME: This is a miscompile triggered by the mgather -> -; riscv.masked.strided.load combine. In order for it to trigger we need either a -; strided gather that RISCVGatherScatterLowering doesn't pick up, or a new -; strided gather generated by the widening sew combine. define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ; RV32V-LABEL: masked_gather_widen_sew_negative_stride: ; RV32V: # %bb.0: -; RV32V-NEXT: addi a0, a0, -120 -; RV32V-NEXT: li a1, 120 +; RV32V-NEXT: addi a0, a0, 136 +; RV32V-NEXT: li a1, -136 ; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32V-NEXT: vlse64.v v8, (a0), a1 ; RV32V-NEXT: ret ; ; RV64V-LABEL: masked_gather_widen_sew_negative_stride: ; RV64V: # %bb.0: -; RV64V-NEXT: addi a0, a0, -120 -; RV64V-NEXT: li a1, 120 +; RV64V-NEXT: addi a0, a0, 136 +; RV64V-NEXT: li a1, -136 ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vlse64.v v8, (a0), a1 ; RV64V-NEXT: ret From db7e9e68411de074dee78c92657e983da4b89500 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 21 Feb 2024 20:59:42 -0800 Subject: [PATCH 180/351] [TypeProf][InstrPGO] Introduce raw and instr profile format change for type profiling. (#81691) * Raw profile format - Header: records the byte size of compressed vtable names, and the number of profiled vtable entries (call it `VTableProfData`). 
Header also records padded bytes of each section. - Payload: adds a section for compressed vtable names, and a section to store `VTableProfData`. Both sections are padded so the size is a multiple of 8. * Indexed profile format - Header: records the byte offset of compressed vtable names. - Payload: adds a section to store compressed vtable names. This section is used by `llvm-profdata` to show the list of vtables profiled for an instrumented site. [The originally reviewed patch](https://github.com/llvm/llvm-project/pull/66825) will have profile reader/write change and llvm-profdata change. - To ensure this PR has all the necessary profile format change along with profile version bump, created a copy of the originally reviewed patch in https://github.com/llvm/llvm-project/pull/80761. The copy doesn't have profile format change, but it has the set of tests which covers type profile generation, profile read and profile merge. Tests pass there. rfc in https://discourse.llvm.org/t/rfc-dynamic-type-profiling-and-optimizations-in-llvm/74600 --------- Co-authored-by: modiking --- compiler-rt/include/profile/InstrProfData.inc | 50 ++++++++- compiler-rt/lib/profile/InstrProfiling.h | 35 +++++-- .../lib/profile/InstrProfilingBuffer.c | 96 +++++++++++++++--- .../lib/profile/InstrProfilingInternal.h | 8 +- compiler-rt/lib/profile/InstrProfilingMerge.c | 23 ++++- .../lib/profile/InstrProfilingPlatformLinux.c | 20 ++++ .../lib/profile/InstrProfilingWriter.c | 37 +++++-- .../profile/instrprof-write-buffer-internal.c | 6 +- llvm/include/llvm/ProfileData/InstrProf.h | 17 +++- .../llvm/ProfileData/InstrProfData.inc | 50 ++++++++- .../llvm/ProfileData/InstrProfReader.h | 13 +++ llvm/lib/ProfileData/InstrProf.cpp | 11 +- llvm/lib/ProfileData/InstrProfReader.cpp | 44 +++++++- llvm/lib/ProfileData/InstrProfWriter.cpp | 42 ++++++-- .../InstrProfiling/coverage.ll | 8 +- .../thinlto_indirect_call_promotion.profraw | Bin 528 -> 544 bytes .../Transforms/PGOProfile/comdat_internal.ll | 4 +- 
.../llvm-profdata/Inputs/c-general.profraw | Bin 2016 -> 2032 bytes .../llvm-profdata/Inputs/compressed.profraw | Bin 1968 -> 1984 bytes .../thinlto_indirect_call_promotion.profraw | Bin 0 -> 528 bytes .../llvm-profdata/binary-ids-padding.test | 6 +- .../llvm-profdata/large-binary-id-size.test | 4 +- ...alformed-not-space-for-another-header.test | 6 +- .../malformed-num-counters-zero.test | 6 +- .../malformed-ptr-to-counter-array.test | 6 +- .../misaligned-binary-ids-size.test | 4 +- .../mismatched-raw-profile-header.test | 2 + .../tools/llvm-profdata/raw-32-bits-be.test | 11 +- .../tools/llvm-profdata/raw-32-bits-le.test | 10 +- .../tools/llvm-profdata/raw-64-bits-be.test | 10 +- .../tools/llvm-profdata/raw-64-bits-le.test | 10 +- .../tools/llvm-profdata/raw-two-profiles.test | 8 +- 32 files changed, 458 insertions(+), 89 deletions(-) create mode 100644 llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index c907a9736f316..1f77853bb8baa 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -96,6 +96,25 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ +/* For a virtual table object, record the name hash to associate profiled + * addresses with global variables, and record {starting address, size in bytes} + * to map the profiled virtual table (which usually have an offset from the + * starting address) back to a virtual table object. 
*/ +#ifndef INSTR_PROF_VTABLE_DATA +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_VTABLE_DATA_DEFINED +#endif +INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ + VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ + IndexedInstrProf::ComputeHash(PGOVTableName))) +INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ + VTablePointer, VTableAddr) +INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ + ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ + VTableSizeVal)) +#undef INSTR_PROF_VTABLE_DATA +/* INSTR_PROF_VTABLE_DATA end. */ /* This is an internal data structure used by value profiler. It * is defined here to allow serialization code sharing by LLVM @@ -147,6 +166,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) +INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -188,13 +209,26 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") +/* For virtual table address profiling, the address point of the virtual table + * (i.e., the address contained in objects pointing to a virtual table) are + * profiled. Note this may not be the address of the per C++ class virtual table + * object (e.g., there might be an offset). + * + * The profiled addresses are stored in raw profile, together with the following + * two types of information. + * 1. 
The (starting and ending) addresses of per C++ class virtual table objects. + * 2. The (compressed) virtual table object names. + * RawInstrProfReader converts profiled virtual table addresses to virtual table + * objects' MD5 hash. + */ +VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. */ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -284,12 +318,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vname, \ + INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ + INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ + INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ + INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -668,9 +708,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 9 +#define INSTR_PROF_RAW_VERSION 10 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 11 +#define INSTR_PROF_INDEX_VERSION 12 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -708,10 +748,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names +#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds +#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -722,10 +764,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" +#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" +#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 0123908336918..be694a8d3330b 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -49,6 +49,12 @@ typedef struct ValueProfNode { #include "profile/InstrProfData.inc" } ValueProfNode; +typedef void *IntPtrT; +typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name; +#include "profile/InstrProfData.inc" +} VTableProfData; + /*! * \brief Return 1 if profile counters are continuously synced to the raw * profile via an mmap(). 
This is in contrast to the default mode, in which @@ -103,12 +109,16 @@ const __llvm_profile_data *__llvm_profile_begin_data(void); const __llvm_profile_data *__llvm_profile_end_data(void); const char *__llvm_profile_begin_names(void); const char *__llvm_profile_end_names(void); +const char *__llvm_profile_begin_vtabnames(void); +const char *__llvm_profile_end_vtabnames(void); char *__llvm_profile_begin_counters(void); char *__llvm_profile_end_counters(void); char *__llvm_profile_begin_bitmap(void); char *__llvm_profile_end_bitmap(void); ValueProfNode *__llvm_profile_begin_vnodes(); ValueProfNode *__llvm_profile_end_vnodes(); +VTableProfData *__llvm_profile_begin_vtables(); +VTableProfData *__llvm_profile_end_vtables(); uint32_t *__llvm_profile_begin_orderfile(); /*! @@ -252,20 +262,31 @@ uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin, /*! \brief Get the size of the profile name section in bytes. */ uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End); -/* ! \brief Given the sizes of the data and counter information, return the - * number of padding bytes before and after the counters, and after the names, - * in the raw profile. +/*! \brief Get the number of virtual table profile data entries */ +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, + const VTableProfData *End); + +/*! \brief Get the size of virtual table profile data in bytes. */ +uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, + const VTableProfData *End); + +/* ! \brief Given the sizes of the data and counter information, computes the + * number of padding bytes before and after the counter section, as well as the + * number of padding bytes after other setions in the raw profile. + * Returns -1 upon errors and 0 upon success. Output parameters should be used + * iff return value is 0. * * Note: When mmap() mode is disabled, no padding bytes before/after counters * are needed. 
However, in mmap() mode, the counter section in the raw profile * must be page-aligned: this API computes the number of padding bytes * needed to achieve that. */ -void __llvm_profile_get_padding_sizes_for_counters( +int __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, - uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap, - uint64_t *PaddingBytesAfterNames); + uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, + uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, + uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames, + uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVNames); /*! * \brief Set the flag that profile data has been dumped to the file. diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index af52804b2b532..7c5c26f4d113b 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -51,16 +51,29 @@ uint64_t __llvm_profile_get_size_for_buffer(void) { const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); + const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); + const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); + const char *VNamesBegin = __llvm_profile_begin_vtabnames(); + const char *VNamesEnd = __llvm_profile_end_vtabnames(); return __llvm_profile_get_size_for_buffer_internal( DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, - NamesBegin, NamesEnd); + NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd); } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { intptr_t BeginI = (intptr_t)Begin, EndI 
= (intptr_t)End; + // `sizeof(__llvm_profile_data) - 1` is required in the numerator when + // [Begin, End] represents an inclusive range. + // For ELF, [Begin, End) represents the address of linker-inserted + // symbols `__start__` and `__stop_`. + // Thereby, `End` is one byte past the inclusive range, and + // `sizeof(__llvm_profile_data) - 1` is not necessary in the numerator to get + // the correct number of profile data. + // FIXME: Consider removing `sizeof(__llvm_profile_data) - 1` if this is true + // across platforms. return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) / sizeof(__llvm_profile_data); } @@ -71,6 +84,26 @@ uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, return __llvm_profile_get_num_data(Begin, End) * sizeof(__llvm_profile_data); } +// Counts the number of `VTableProfData` elements within the range of [Begin, +// End). Caller should guarantee that End points to one byte past the inclusive +// range. +// FIXME: Add a compiler-rt test to make sure the number of vtables in the +// raw profile is the same as the number of vtable elements in the instrumented +// binary. +COMPILER_RT_VISIBILITY +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, + const VTableProfData *End) { + // Convert pointers to intptr_t to use integer arithmetic. 
+ intptr_t EndI = (intptr_t)End, BeginI = (intptr_t)Begin; + return (EndI - BeginI) / sizeof(VTableProfData); +} + +COMPILER_RT_VISIBILITY +uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, + const VTableProfData *End) { + return (intptr_t)(End) - (intptr_t)(Begin); +} + COMPILER_RT_VISIBILITY size_t __llvm_profile_counter_entry_size(void) { if (__llvm_profile_get_version() & VARIANT_MASK_BYTE_COVERAGE) return sizeof(uint8_t); @@ -119,11 +152,13 @@ static int needsCounterPadding(void) { } COMPILER_RT_VISIBILITY -void __llvm_profile_get_padding_sizes_for_counters( +int __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, - uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes, - uint64_t *PaddingBytesAfterNames) { + uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, + uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, + uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames, + uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVName) { + // Counter padding is needed only if continuous mode is enabled. if (!needsCounterPadding()) { *PaddingBytesBeforeCounters = 0; *PaddingBytesAfterCounters = @@ -131,9 +166,19 @@ void __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = __llvm_profile_get_num_padding_bytes(NumBitmapBytes); *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize); - return; + if (PaddingBytesAfterVTable != NULL) + *PaddingBytesAfterVTable = + __llvm_profile_get_num_padding_bytes(VTableSize); + if (PaddingBytesAfterVName != NULL) + *PaddingBytesAfterVName = __llvm_profile_get_num_padding_bytes(VNameSize); + return 0; } + // Value profiling not supported in continuous mode at profile-write time. + // Return -1 to alert the incompatibility. 
+ if (VTableSize != 0 || VNameSize != 0) + return -1; + // In continuous mode, the file offsets for headers and for the start of // counter sections need to be page-aligned. *PaddingBytesBeforeCounters = @@ -142,13 +187,22 @@ void __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = calculateBytesNeededToPageAlign(NumBitmapBytes); *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize); + // Set these two variables to zero to avoid uninitialized variables + // even if VTableSize and VNameSize are known to be zero. + if (PaddingBytesAfterVTable != NULL) + *PaddingBytesAfterVTable = 0; + if (PaddingBytesAfterVName != NULL) + *PaddingBytesAfterVName = 0; + return 0; } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd) { + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, + const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, + const char *VNamesBegin, const char *VNamesEnd) { /* Match logic in __llvm_profile_write_buffer(). */ const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char); uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -156,20 +210,29 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( __llvm_profile_get_counters_size(CountersBegin, CountersEnd); const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); + const uint64_t VTableSize = + __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); + const uint64_t VNameSize = + __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Determine how much padding is needed before/after the counters and after * the names. 
*/ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes, + PaddingBytesAfterVTable, PaddingBytesAfterVNames; __llvm_profile_get_padding_sizes_for_counters( - DataSize, CountersSize, NumBitmapBytes, NamesSize, - &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, - &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); + DataSize, CountersSize, NumBitmapBytes, NamesSize, 0 /* VTableSize */, + 0 /* VNameSize */, &PaddingBytesBeforeCounters, + &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, + &PaddingBytesAfterNames, &PaddingBytesAfterVTable, + &PaddingBytesAfterVNames); return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + DataSize + PaddingBytesBeforeCounters + CountersSize + PaddingBytesAfterCounters + NumBitmapBytes + - PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames; + PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames + + VTableSize + PaddingBytesAfterVTable + VNameSize + + PaddingBytesAfterVNames; } COMPILER_RT_VISIBILITY @@ -191,7 +254,10 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal( const char *NamesBegin, const char *NamesEnd) { ProfDataWriter BufferWriter; initBufferWriter(&BufferWriter, Buffer); - return lprofWriteDataImpl(&BufferWriter, DataBegin, DataEnd, CountersBegin, - CountersEnd, BitmapBegin, BitmapEnd, 0, NamesBegin, - NamesEnd, 0); + // Set virtual table arguments to NULL since they are not supported yet. 
+ return lprofWriteDataImpl( + &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, + BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd, + /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL, + /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0); } diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index 03ed67fcfa766..d5bd0e41fb129 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -22,7 +22,9 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd); + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, + const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, + const char *VNamesBegin, const char *VNamesEnd); /*! * \brief Write instrumentation data to the given buffer, given explicit @@ -156,7 +158,9 @@ int lprofWriteDataImpl(ProfDataWriter *Writer, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, int SkipNameDataWrite); + const char *NamesEnd, const VTableProfData *VTableBegin, + const VTableProfData *VTableEnd, const char *VNamesBegin, + const char *VNamesEnd, int SkipNameDataWrite); /* Merge value profile data pointed to by SrcValueProfData into * in-memory profile counters pointed by to DstData. 
*/ diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index b5850e99ee37d..c0706b73e1668 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -107,6 +107,26 @@ static uintptr_t signextIfWin64(void *V) { #endif } +// Skip names section, vtable profile data section and vtable names section +// for runtime profile merge. To merge runtime addresses from multiple +// profiles collected from the same instrumented binary, the binary should be +// loaded at fixed base address (e.g., build with -no-pie, or run with ASLR +// disabled). In this set-up these three sections remain unchanged. +static uint64_t +getDistanceFromCounterToValueProf(const __llvm_profile_header *const Header) { + const uint64_t VTableSectionSize = + Header->NumVTables * sizeof(VTableProfData); + const uint64_t PaddingBytesAfterVTableSection = + __llvm_profile_get_num_padding_bytes(VTableSectionSize); + const uint64_t VNamesSize = Header->VNamesSize; + const uint64_t PaddingBytesAfterVNamesSize = + __llvm_profile_get_num_padding_bytes(VNamesSize); + return Header->NamesSize + + __llvm_profile_get_num_padding_bytes(Header->NamesSize) + + VTableSectionSize + PaddingBytesAfterVTableSection + VNamesSize + + PaddingBytesAfterVNamesSize; +} + COMPILER_RT_VISIBILITY int __llvm_profile_merge_from_buffer(const char *ProfileData, uint64_t ProfileSize) { @@ -137,8 +157,7 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, SrcBitmapStart = SrcCountersEnd; SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes; SrcValueProfDataStart = - SrcNameStart + Header->NamesSize + - __llvm_profile_get_num_padding_bytes(Header->NamesSize); + SrcNameStart + getDistanceFromCounterToValueProf(Header); if (SrcNameStart < SrcCountersStart || SrcNameStart < SrcBitmapStart) return 1; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c 
index 19266ab6c6fb8..d2554a2702aaf 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -24,8 +24,12 @@ #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) +#define PROF_VNAME_START INSTR_PROF_SECT_START(INSTR_PROF_VNAME_COMMON) +#define PROF_VNAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNAME_COMMON) #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) +#define PROF_VTABLE_START INSTR_PROF_SECT_START(INSTR_PROF_VTAB_COMMON) +#define PROF_VTABLE_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VTAB_COMMON) #define PROF_BITS_START INSTR_PROF_SECT_START(INSTR_PROF_BITS_COMMON) #define PROF_BITS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_BITS_COMMON) #define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON) @@ -41,6 +45,10 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_VNAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern uint32_t PROF_ORDERFILE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; @@ -63,6 +71,18 @@ COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_names(void) { COMPILER_RT_VISIBILITY const char *__llvm_profile_end_names(void) { return &PROF_NAME_STOP; } +COMPILER_RT_VISIBILITY 
const char *__llvm_profile_begin_vtabnames(void) { + return &PROF_VNAME_START; +} +COMPILER_RT_VISIBILITY const char *__llvm_profile_end_vtabnames(void) { + return &PROF_VNAME_STOP; +} +COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_begin_vtables(void) { + return &PROF_VTABLE_START; +} +COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_end_vtables(void) { + return &PROF_VTABLE_STOP; +} COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) { return &PROF_CNTS_START; } diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c index 4d767d1385148..8816a71155511 100644 --- a/compiler-rt/lib/profile/InstrProfilingWriter.c +++ b/compiler-rt/lib/profile/InstrProfilingWriter.c @@ -250,9 +250,14 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer, const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); + const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); + const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); + const char *VNamesBegin = __llvm_profile_begin_vtabnames(); + const char *VNamesEnd = __llvm_profile_end_vtabnames(); return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, VPDataReader, - NamesBegin, NamesEnd, SkipNameDataWrite); + NamesBegin, NamesEnd, VTableBegin, VTableEnd, + VNamesBegin, VNamesEnd, SkipNameDataWrite); } COMPILER_RT_VISIBILITY int @@ -261,7 +266,9 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, int SkipNameDataWrite) { + const char *NamesEnd, const VTableProfData *VTableBegin, + const VTableProfData *VTableEnd, const char *VNamesBegin, + const char *VNamesEnd, int 
SkipNameDataWrite) { /* Calculate size of sections. */ const uint64_t DataSectionSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -273,6 +280,12 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); const uint64_t NamesSize = __llvm_profile_get_name_size(NamesBegin, NamesEnd); + const uint64_t NumVTables = + __llvm_profile_get_num_vtable(VTableBegin, VTableEnd); + const uint64_t VTableSectionSize = + __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); + const uint64_t VNamesSize = + __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Create the header. */ __llvm_profile_header Header; @@ -280,11 +293,15 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, /* Determine how much padding is needed before/after the counters and after * the names. */ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; - __llvm_profile_get_padding_sizes_for_counters( - DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, - &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, - &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); + PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames, + PaddingBytesAfterVTable, PaddingBytesAfterVNames; + if (__llvm_profile_get_padding_sizes_for_counters( + DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, + VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters, + &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, + &PaddingBytesAfterNames, &PaddingBytesAfterVTable, + &PaddingBytesAfterVNames) == -1) + return -1; { /* Initialize header structure. 
*/ @@ -323,7 +340,11 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, {BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0}, {NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1}, {SkipNameDataWrite ? NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}}; + {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}, + {VTableBegin, sizeof(uint8_t), VTableSectionSize, 0}, + {NULL, sizeof(uint8_t), PaddingBytesAfterVTable, 1}, + {SkipNameDataWrite ? NULL : VNamesBegin, sizeof(uint8_t), VNamesSize, 0}, + {NULL, sizeof(uint8_t), PaddingBytesAfterVNames, 1}}; if (Writer->Write(Writer, IOVecData, sizeof(IOVecData) / sizeof(*IOVecData))) return -1; diff --git a/compiler-rt/test/profile/instrprof-write-buffer-internal.c b/compiler-rt/test/profile/instrprof-write-buffer-internal.c index d9670f739ca98..2c1c29ac0c588 100644 --- a/compiler-rt/test/profile/instrprof-write-buffer-internal.c +++ b/compiler-rt/test/profile/instrprof-write-buffer-internal.c @@ -31,7 +31,8 @@ char *__llvm_profile_end_bitmap(void); uint64_t __llvm_profile_get_size_for_buffer_internal( const void *DataBegin, const void *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, - const char *NamesBegin, const char *NamesEnd); + const char *NamesBegin, const char *NamesEnd, const void *VTableBegin, + const void *VTableEnd, const char *VNamesBegin, const char *VNamesEnd); int __llvm_profile_write_buffer_internal( char *Buffer, const void *DataBegin, const void *DataEnd, @@ -45,7 +46,8 @@ int main(int argc, const char *argv[]) { __llvm_profile_begin_data(), __llvm_profile_end_data(), __llvm_profile_begin_counters(), __llvm_profile_end_counters(), __llvm_profile_begin_bitmap(), __llvm_profile_end_bitmap(), - __llvm_profile_begin_names(), __llvm_profile_end_names()); + __llvm_profile_begin_names(), __llvm_profile_end_names(), NULL, NULL, + NULL, NULL); char *buf = malloc(bufsize); 
int ret = __llvm_profile_write_buffer_internal( diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index a928ba6961f36..25ec06a739202 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -831,6 +831,7 @@ struct InstrProfRecord { struct ValueProfData { std::vector IndirectCallSites; std::vector MemOPSizes; + std::vector VTableTargets; }; std::unique_ptr ValueData; @@ -853,6 +854,8 @@ struct InstrProfRecord { return ValueData->IndirectCallSites; case IPVK_MemOPSize: return ValueData->MemOPSizes; + case IPVK_VTableTarget: + return ValueData->VTableTargets; default: llvm_unreachable("Unknown value kind!"); } @@ -1036,7 +1039,9 @@ enum ProfVersion { Version10 = 10, // An additional field is used for bitmap bytes. Version11 = 11, - // The current version is 11. + // VTable profiling, + Version12 = 12, + // The current version is 12. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1057,6 +1062,7 @@ struct Header { uint64_t MemProfOffset; uint64_t BinaryIdOffset; uint64_t TemporalProfTracesOffset; + uint64_t VTableNamesOffset; // New fields should only be added at the end to ensure that the size // computation is correct. The methods below need to be updated to ensure that // the new field is read correctly. @@ -1193,8 +1199,13 @@ template <> inline uint64_t getMagic() { // It should also match the synthesized type in // Transforms/Instrumentation/InstrProfiling.cpp:getOrCreateRegionCounters. 
template struct alignas(8) ProfileData { - #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; - #include "llvm/ProfileData/InstrProfData.inc" +#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; +#include "llvm/ProfileData/InstrProfData.inc" +}; + +template struct alignas(8) VTableProfileData { +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) Type Name; +#include "llvm/ProfileData/InstrProfData.inc" }; // File header structure of the LLVM profile data in raw format. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index c907a9736f316..1f77853bb8baa 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -96,6 +96,25 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ +/* For a virtual table object, record the name hash to associate profiled + * addresses with global variables, and record {starting address, size in bytes} + * to map the profiled virtual table (which usually have an offset from the + * starting address) back to a virtual table object. */ +#ifndef INSTR_PROF_VTABLE_DATA +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_VTABLE_DATA_DEFINED +#endif +INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ + VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ + IndexedInstrProf::ComputeHash(PGOVTableName))) +INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ + VTablePointer, VTableAddr) +INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ + ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ + VTableSizeVal)) +#undef INSTR_PROF_VTABLE_DATA +/* INSTR_PROF_VTABLE_DATA end. */ /* This is an internal data structure used by value profiler. 
It * is defined here to allow serialization code sharing by LLVM @@ -147,6 +166,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) +INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -188,13 +209,26 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") +/* For virtual table address profiling, the address point of the virtual table + * (i.e., the address contained in objects pointing to a virtual table) are + * profiled. Note this may not be the address of the per C++ class virtual table + * object (e.g., there might be an offset). + * + * The profiled addresses are stored in raw profile, together with the following + * two types of information. + * 1. The (starting and ending) addresses of per C++ class virtual table objects. + * 2. The (compressed) virtual table object names. + * RawInstrProfReader converts profiled virtual table addresses to virtual table + * objects' MD5 hash. + */ +VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. 
*/ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -284,12 +318,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vname, \ + INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ + INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ + INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ + INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -668,9 +708,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 9 +#define INSTR_PROF_RAW_VERSION 10 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 11 +#define INSTR_PROF_INDEX_VERSION 12 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -708,10 +748,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names +#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds +#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -722,10 +764,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" +#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" +#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index 87f15639a2c3c..cfde5d3fc77d6 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -326,12 +326,16 @@ class RawInstrProfReader : public InstrProfReader { uint64_t NamesDelta; const RawInstrProf::ProfileData *Data; const RawInstrProf::ProfileData *DataEnd; + const RawInstrProf::VTableProfileData *VTableBegin = nullptr; + const RawInstrProf::VTableProfileData *VTableEnd = nullptr; const char *CountersStart; const char *CountersEnd; const char *BitmapStart; const char *BitmapEnd; const char *NamesStart; const char *NamesEnd; + const char *VNamesStart = nullptr; + const 
char *VNamesEnd = nullptr; // After value profile is all read, this pointer points to // the header of next profile data (if exists) const uint8_t *ValueDataStart; @@ -656,6 +660,15 @@ class IndexedInstrProfReader : public InstrProfReader { std::unique_ptr MemProfRecordTable; /// MemProf frame profile data on-disk indexed via frame id. std::unique_ptr MemProfFrameTable; + /// VTableNamePtr points to the beginning of compressed vtable names. + /// When a symtab is constructed from profiles by llvm-profdata, the list of + /// names could be decompressed based on `VTableNamePtr` and + /// `CompressedVTableNamesLen`. + /// A compiler that reads indexed profiles could construct symtab from module + /// IR so it doesn't need the decompressed names. + const char *VTableNamePtr = nullptr; + /// The length of compressed vtable names. + uint64_t CompressedVTableNamesLen = 0; /// Total size of binary ids. uint64_t BinaryIdsSize{0}; /// Start address of binary id length and data pairs. diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 2eeeff987399d..b9afee413853e 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1533,9 +1533,12 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { // When a new field is added in the header add a case statement here to // populate it. static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version11, + IndexedInstrProf::ProfVersion::CurrentVersion == Version12, "Please update the reading code below if a new field has been added, " "if not add a case statement to fall through to the latest version."); + case 12ull: + H.VTableNamesOffset = read(Buffer, offsetOf(&Header::VTableNamesOffset)); + [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: @@ -1561,10 +1564,14 @@ size_t Header::size() const { // When a new field is added to the header add a case statement here to // compute the size as offset of the new field + size of the new field. This // relies on the field being added to the end of the list. - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version11, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12, "Please update the size computation below if a new field has " "been added to the header, if not add a case statement to " "fall through to the latest version."); + case 12ull: + return offsetOf(&Header::VTableNamesOffset) + + sizeof(Header::VTableNamesOffset); + [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 0d8d43daae960..31b742bca14d6 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -366,6 +366,11 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) { return E; Value = IndexedInstrProf::ComputeHash(VD.first); } + } else if (ValueKind == IPVK_VTableTarget) { + if (InstrProfSymtab::isExternalSymbol(VD.first)) + Value = 0; + else + Value = IndexedInstrProf::ComputeHash(VD.first); } else { READ_NUM(VD.first, Value); } @@ -582,10 +587,17 @@ Error RawInstrProfReader::readHeader( auto NumBitmapBytes = 
swap(Header.NumBitmapBytes); auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes); auto NamesSize = swap(Header.NamesSize); + auto VTableNameSize = swap(Header.VNamesSize); + auto NumVTables = swap(Header.NumVTables); ValueKindLast = swap(Header.ValueKindLast); auto DataSize = NumData * sizeof(RawInstrProf::ProfileData); - auto PaddingSize = getNumPaddingBytes(NamesSize); + auto PaddingBytesAfterNames = getNumPaddingBytes(NamesSize); + auto PaddingBytesAfterVTableNames = getNumPaddingBytes(VTableNameSize); + + auto VTableSectionSize = + NumVTables * sizeof(RawInstrProf::VTableProfileData); + auto PaddingBytesAfterVTableProfData = getNumPaddingBytes(VTableSectionSize); // Profile data starts after profile header and binary ids if exist. ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdSize; @@ -594,7 +606,12 @@ Error RawInstrProfReader::readHeader( CountersOffset + CountersSize + PaddingBytesAfterCounters; ptrdiff_t NamesOffset = BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes; - ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; + ptrdiff_t VTableProfDataOffset = + NamesOffset + NamesSize + PaddingBytesAfterNames; + ptrdiff_t VTableNameOffset = VTableProfDataOffset + VTableSectionSize + + PaddingBytesAfterVTableProfData; + ptrdiff_t ValueDataOffset = + VTableNameOffset + VTableNameSize + PaddingBytesAfterVTableNames; auto *Start = reinterpret_cast(&Header); if (Start + ValueDataOffset > DataBuffer->getBufferEnd()) @@ -614,8 +631,14 @@ Error RawInstrProfReader::readHeader( Data = reinterpret_cast *>( Start + DataOffset); DataEnd = Data + NumData; + VTableBegin = + reinterpret_cast *>( + Start + VTableProfDataOffset); + VTableEnd = VTableBegin + NumVTables; NamesStart = Start + NamesOffset; NamesEnd = NamesStart + NamesSize; + VNamesStart = Start + VTableNameOffset; + VNamesEnd = VNamesStart + VTableNameSize; } CountersStart = Start + CountersOffset; @@ -1260,6 +1283,23 @@ Error 
IndexedInstrProfReader::readHeader() { "corrupted binary ids"); } + if (GET_VERSION(Header->formatVersion()) >= 12) { + uint64_t VTableNamesOffset = + endian::byte_swap( + Header->VTableNamesOffset); + const unsigned char *Ptr = Start + VTableNamesOffset; + + CompressedVTableNamesLen = + support::endian::readNext(Ptr); + + // Writer first writes the length of compressed string, and then the actual + // content. + VTableNamePtr = (const char *)Ptr; + if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd()) + return make_error(instrprof_error::truncated); + } + if (GET_VERSION(Header->formatVersion()) >= 10 && Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) { uint64_t TemporalProfTracesOffset = diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index d65f8fe50313d..e5163ebe8ae37 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -455,12 +455,12 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.MemProfOffset = 0; Header.BinaryIdOffset = 0; Header.TemporalProfTracesOffset = 0; + Header.VTableNamesOffset = 0; int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); - // Only write out all the fields except 'HashOffset', 'MemProfOffset', - // 'BinaryIdOffset' and `TemporalProfTracesOffset`. We need to remember the - // offset of these fields to allow back patching later. - for (int I = 0; I < N - 4; I++) + // Only write out the first four fields. We need to remember the offset of the + // remaining fields to allow back patching later. + for (int I = 0; I < 4; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -484,6 +484,9 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { uint64_t TemporalProfTracesOffset = OS.tell(); OS.write(0); + uint64_t VTableNamesOffset = OS.tell(); + OS.write(0); + // Reserve space to write profile summary data. 
uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -604,6 +607,31 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { OS.writeByte(0); } + uint64_t VTableNamesSectionStart = OS.tell(); + + // Use a dummy (and uncompressed) string as compressed vtable names and get + // the necessary profile format change in place for version 12. + // TODO: Store the list of vtable names in InstrProfWriter and use the + // real compressed name. + std::string CompressedVTableNames = "VTableNames"; + + uint64_t CompressedStringLen = CompressedVTableNames.length(); + + // Record the length of compressed string. + OS.write(CompressedStringLen); + + // Write the chars in compressed strings. + for (auto &c : CompressedVTableNames) + OS.writeByte(static_cast(c)); + + // Pad up to a multiple of 8. + // InstrProfReader would read bytes according to 'CompressedStringLen'. + uint64_t PaddedLength = alignTo(CompressedStringLen, 8); + + for (uint64_t K = CompressedStringLen; K < PaddedLength; K++) { + OS.writeByte(0); + } + uint64_t TemporalProfTracesSectionStart = 0; if (static_cast(ProfileKind & InstrProfKind::TemporalProfile)) { TemporalProfTracesSectionStart = OS.tell(); @@ -647,6 +675,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Patch the Header.TemporalProfTracesOffset (=0 for profiles without // traces). {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1}, + {VTableNamesOffset, &VTableNamesSectionStart, 1}, // Patch the summary data. 
{SummaryOffset, reinterpret_cast(TheSummary.get()), (int)(SummarySize / sizeof(uint64_t))}, @@ -699,7 +728,8 @@ Error InstrProfWriter::validateRecord(const InstrProfRecord &Func) { std::unique_ptr VD = Func.getValueForSite(VK, S); DenseSet SeenValues; for (uint32_t I = 0; I < ND; I++) - if ((VK != IPVK_IndirectCallTarget) && !SeenValues.insert(VD[I].Value).second) + if ((VK != IPVK_IndirectCallTarget && VK != IPVK_VTableTarget) && + !SeenValues.insert(VD[I].Value).second) return make_error(instrprof_error::invalid_prof); } } @@ -747,7 +777,7 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash, OS << ND << "\n"; std::unique_ptr VD = Func.getValueForSite(VK, S); for (uint32_t I = 0; I < ND; I++) { - if (VK == IPVK_IndirectCallTarget) + if (VK == IPVK_IndirectCallTarget || VK == IPVK_VTableTarget) OS << Symtab.getFuncOrVarNameIfDefined(VD[I].Value) << ":" << VD[I].Count << "\n"; else diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll index bbf895ea4b34e..08cbcaa962b76 100644 --- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll +++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll @@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu" @__profn_foo = private constant [3 x i8] c"foo" ; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) -; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), +; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) +; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { 
i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), @__profn_bar = private constant [3 x i8] c"bar" ; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) -; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), +; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) +; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), ; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names" ; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames" diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw index 5efda10bb98a941c04b6846db05d3691bc36aac0..3daa98f937b691880ffff203c9426bfacddf749d 100644 GIT binary patch delta 133 zcmbQhvVeuNu_!ISs37M**F;W##f(djpGdFz|9^9xwDglu1`NP7F;ks2U=>hu;#6za s1Tf>OHE#ik0aQLiPe%I5WLZXI)&n4s$)Sw16~Kysa*R;Jz`Bw604`-Eq5uE@ delta 117 zcmZ3$GJ%D&u_!ISs37M*=R{6_L67IVA1SZ;|9^9yv+SKv1_s87mFlblGl86mORZTI rz>KHXyapf!P9f diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll index 8c6942c0f527b..1bad0db1b4762 100644 --- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll +++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll @@ -13,9 +13,9 @@ $foo = comdat any ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat ; CHECK-NOT: __profn__stdin__foo ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] 
zeroinitializer, section "__llvm_prf_cnts", comdat, align 8 -; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null +; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null ; CHECK-NOT: @foo -; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 +; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 ; CHECK: @__llvm_prf_nm ; CHECK: @llvm.compiler.used diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw index 9cd225587c92511e99f3497ce1d5f47c6fc5f0af..a3e884343942ebc70ba95ab4ee006630b6816d80 100644 GIT binary patch delta 40 ycmV+@0N4NE5AY8OfpTVVa&T<_3Xus<4&W)M0UE0R|DByz{^eDZP6HaTaBv4~Q4!$) delta 39 vcmeys|A3#fu_!ISs37M*=R{6_K?|$bHJ=*(|Lz`(e%(w!XuMI#TR diff --git a/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw new file mode 100644 index 0000000000000000000000000000000000000000..84707ba2070a92b8683010d9daaef747df35f9ac GIT binary patch literal 528 zcmZoHO3N=Q$obF700xW@ih+Rz#(>i3d^BkWXQ;q~{}8~jee0hktN#DrJkOIkI+TF{ zX0YI^%?f`vOg;fr_5L!KFBeQb%shva5cM!VOdpINJ<~YH=c-N(O#cd~eK7d|0{XA2 zYFH&6%DWHJCbaDydjXpM1gQQWo?dWwGr?0yS0{Tm3_5AzQ$ z+Q7KtR(HRVzu%dYp1!6!$!AXbT=MqY*4O{3u}gA_;W2kfsb$ZfsH;9ZvV7_@)#;23 
r{WSu+S$HaLo%TI*hM9pynsFJ}wH81UW(Uaqj8G0Nd|-00@P_dLvBrhT literal 0 HcmV?d00001 diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test index eda63203a304a..61881b69cfd5c 100644 --- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test +++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw // There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes. // 2 * 8 binary ID sizes // + 2 * 20 binary IDs (of size 20) @@ -32,6 +34,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. 
RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test index 38b838e0d100a..316a9a4c9df4c 100644 --- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test +++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Check for a corrupted size being too large past the end of the file. RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test index c967e850dbe35..8b686d5c50cb7 100644 --- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test +++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test index 2e747f81a6bfa..089afad420622 100644 --- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test +++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test index 3c23bc7dd0f7f..e404ba4210cc1 100644 --- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test +++ 
b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test index 4a5c42843ff4d..ee54bfb978567 100644 --- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test +++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw // We should fail on this because the binary IDs is not a multiple of 8 bytes. 
RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -10,6 +10,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test index 2a92575ee3407..dfa163f1f3439 100644 --- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test +++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test @@ -15,6 +15,8 @@ RUN: printf '\0\0\0\0\0\0\0\20' >> %t RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: not llvm-profdata show %t -o /dev/null 2>&1 | FileCheck %s diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test index 8220361df6cfa..63782c8b94d4a 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test @@ -1,5 +1,6 @@ +// Header RUN: printf '\377lprofR\201' > %t -RUN: printf '\0\0\0\0\0\0\0\11' >> %t +RUN: printf '\0\0\0\0\0\0\0\12' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +13,8 @@ RUN: printf '\0\0\0\0\1\0\0\0' >> %t RUN: printf '\0\0\0\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -20,9 +23,8 @@ 
RUN: printf '\3\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t @@ -31,9 +33,8 @@ RUN: printf '\2\xff\xff\xd3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\2' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test index 9352ae132380d..e9569bec1178b 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201Rforpl\377' > %t -RUN: printf '\11\0\0\0\0\0\0\0' >> %t +RUN: printf '\12\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\0\0\0' >> %t RUN: printf '\0\0\0\3\0\0\0\0' >> %t RUN: printf '\0\0\0\2\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -31,9 +32,8 @@ RUN: printf '\xd3\xff\xff\2' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf 
'\2\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test index c3e995add6ff2..0bc579eec58ab 100644 --- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test @@ -1,5 +1,5 @@ RUN: printf '\377lprofr\201' > %t -RUN: printf '\0\0\0\0\0\0\0\11' >> %t +RUN: printf '\0\0\0\0\0\0\0\12' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\02' >> %t @@ -31,9 +32,8 @@ RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\02' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test index 0b3ef2a89abe5..ca9ea54c3f014 100644 --- 
a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t -RUN: printf '\11\0\0\0\0\0\0\0' >> %t +RUN: printf '\12\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\4\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -31,9 +32,8 @@ RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\02\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test index f4a9aa8e1bbc3..70a4210dea9f8 100644 --- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test +++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t-foo.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t-foo.profraw @@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\254\275\030\333\114\302\370\134' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw @@ -26,7 +28,7 @@ RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw RUN: printf '\201rforpl\377' > %t-bar.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw @@ -39,6 +41,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\067\265\035\031\112\165\023\344' >> %t-bar.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw From 4d73cbe863886add6742a8ebd00d19c1cab11095 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 21 Feb 2024 21:10:47 -0800 Subject: [PATCH 181/351] [nfc]remove unused variable after pr/81691 (#82578) * `N` became unused after [pull request 81691](https://github.com/llvm/llvm-project/pull/81691) * This should fix the build bot failure of `unused variable` https://lab.llvm.org/buildbot/#/builders/77/builds/34840 --- llvm/lib/ProfileData/InstrProfWriter.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index e5163ebe8ae37..3e0a0e0d70116 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ 
b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -456,7 +456,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.BinaryIdOffset = 0; Header.TemporalProfTracesOffset = 0; Header.VTableNamesOffset = 0; - int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); // Only write out the first four fields. We need to remember the offset of the // remaining fields to allow back patching later. From 0e8d1877cd145719b7acb707539287b7b877a555 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 21 Feb 2024 21:41:33 -0800 Subject: [PATCH 182/351] Revert type profiling change as compiler-rt test break on Windows. (#82583) Examples https://lab.llvm.org/buildbot/#/builders/127/builds/62532/steps/8/logs/stdio --- compiler-rt/include/profile/InstrProfData.inc | 50 +-------- compiler-rt/lib/profile/InstrProfiling.h | 35 ++----- .../lib/profile/InstrProfilingBuffer.c | 96 +++--------------- .../lib/profile/InstrProfilingInternal.h | 8 +- compiler-rt/lib/profile/InstrProfilingMerge.c | 23 +---- .../lib/profile/InstrProfilingPlatformLinux.c | 20 ---- .../lib/profile/InstrProfilingWriter.c | 37 ++----- .../profile/instrprof-write-buffer-internal.c | 6 +- llvm/include/llvm/ProfileData/InstrProf.h | 17 +--- .../llvm/ProfileData/InstrProfData.inc | 50 +-------- .../llvm/ProfileData/InstrProfReader.h | 13 --- llvm/lib/ProfileData/InstrProf.cpp | 11 +- llvm/lib/ProfileData/InstrProfReader.cpp | 44 +------- llvm/lib/ProfileData/InstrProfWriter.cpp | 43 ++------ .../InstrProfiling/coverage.ll | 8 +- .../thinlto_indirect_call_promotion.profraw | Bin 544 -> 528 bytes .../Transforms/PGOProfile/comdat_internal.ll | 4 +- .../llvm-profdata/Inputs/c-general.profraw | Bin 2032 -> 2016 bytes .../llvm-profdata/Inputs/compressed.profraw | Bin 1984 -> 1968 bytes .../thinlto_indirect_call_promotion.profraw | Bin 528 -> 0 bytes .../llvm-profdata/binary-ids-padding.test | 6 +- .../llvm-profdata/large-binary-id-size.test | 4 +- ...alformed-not-space-for-another-header.test | 6 +- 
.../malformed-num-counters-zero.test | 6 +- .../malformed-ptr-to-counter-array.test | 6 +- .../misaligned-binary-ids-size.test | 4 +- .../mismatched-raw-profile-header.test | 2 - .../tools/llvm-profdata/raw-32-bits-be.test | 11 +- .../tools/llvm-profdata/raw-32-bits-le.test | 10 +- .../tools/llvm-profdata/raw-64-bits-be.test | 10 +- .../tools/llvm-profdata/raw-64-bits-le.test | 10 +- .../tools/llvm-profdata/raw-two-profiles.test | 8 +- 32 files changed, 90 insertions(+), 458 deletions(-) delete mode 100644 llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 1f77853bb8baa..c907a9736f316 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -96,25 +96,6 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ -/* For a virtual table object, record the name hash to associate profiled - * addresses with global variables, and record {starting address, size in bytes} - * to map the profiled virtual table (which usually have an offset from the - * starting address) back to a virtual table object. */ -#ifndef INSTR_PROF_VTABLE_DATA -#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) -#else -#define INSTR_PROF_VTABLE_DATA_DEFINED -#endif -INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ - VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ - IndexedInstrProf::ComputeHash(PGOVTableName))) -INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ - VTablePointer, VTableAddr) -INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ - ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ - VTableSizeVal)) -#undef INSTR_PROF_VTABLE_DATA -/* INSTR_PROF_VTABLE_DATA end. 
*/ /* This is an internal data structure used by value profiler. It * is defined here to allow serialization code sharing by LLVM @@ -166,8 +147,6 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) -INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -209,26 +188,13 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") -/* For virtual table address profiling, the address point of the virtual table - * (i.e., the address contained in objects pointing to a virtual table) are - * profiled. Note this may not be the address of the per C++ class virtual table - * object (e.g., there might be an offset). - * - * The profiled addresses are stored in raw profile, together with the following - * two types of information. - * 1. The (starting and ending) addresses of per C++ class virtual table objects. - * 2. The (compressed) virtual table object names. - * RawInstrProfReader converts profiled virtual table addresses to virtual table - * objects' MD5 hash. - */ -VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. 
*/ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -318,18 +284,12 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") -INSTR_PROF_SECT_ENTRY(IPSK_vname, \ - INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ - INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") -INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ - INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ - INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -708,9 +668,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 10 +#define INSTR_PROF_RAW_VERSION 9 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 12 +#define INSTR_PROF_INDEX_VERSION 11 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -748,12 +708,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names -#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds -#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -764,12 +722,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" -#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" -#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index be694a8d3330b..0123908336918 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -49,12 +49,6 @@ typedef struct ValueProfNode { #include "profile/InstrProfData.inc" } ValueProfNode; -typedef void *IntPtrT; -typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { -#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name; -#include "profile/InstrProfData.inc" -} VTableProfData; - /*! * \brief Return 1 if profile counters are continuously synced to the raw * profile via an mmap(). 
This is in contrast to the default mode, in which @@ -109,16 +103,12 @@ const __llvm_profile_data *__llvm_profile_begin_data(void); const __llvm_profile_data *__llvm_profile_end_data(void); const char *__llvm_profile_begin_names(void); const char *__llvm_profile_end_names(void); -const char *__llvm_profile_begin_vtabnames(void); -const char *__llvm_profile_end_vtabnames(void); char *__llvm_profile_begin_counters(void); char *__llvm_profile_end_counters(void); char *__llvm_profile_begin_bitmap(void); char *__llvm_profile_end_bitmap(void); ValueProfNode *__llvm_profile_begin_vnodes(); ValueProfNode *__llvm_profile_end_vnodes(); -VTableProfData *__llvm_profile_begin_vtables(); -VTableProfData *__llvm_profile_end_vtables(); uint32_t *__llvm_profile_begin_orderfile(); /*! @@ -262,31 +252,20 @@ uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin, /*! \brief Get the size of the profile name section in bytes. */ uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End); -/*! \brief Get the number of virtual table profile data entries */ -uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, - const VTableProfData *End); - -/*! \brief Get the size of virtual table profile data in bytes. */ -uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, - const VTableProfData *End); - -/* ! \brief Given the sizes of the data and counter information, computes the - * number of padding bytes before and after the counter section, as well as the - * number of padding bytes after other setions in the raw profile. - * Returns -1 upon errors and 0 upon success. Output parameters should be used - * iff return value is 0. +/* ! \brief Given the sizes of the data and counter information, return the + * number of padding bytes before and after the counters, and after the names, + * in the raw profile. * * Note: When mmap() mode is disabled, no padding bytes before/after counters * are needed. 
However, in mmap() mode, the counter section in the raw profile * must be page-aligned: this API computes the number of padding bytes * needed to achieve that. */ -int __llvm_profile_get_padding_sizes_for_counters( +void __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, - uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, - uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames, - uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVNames); + uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, + uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap, + uint64_t *PaddingBytesAfterNames); /*! * \brief Set the flag that profile data has been dumped to the file. diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 7c5c26f4d113b..af52804b2b532 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -51,29 +51,16 @@ uint64_t __llvm_profile_get_size_for_buffer(void) { const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); - const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); - const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); - const char *VNamesBegin = __llvm_profile_begin_vtabnames(); - const char *VNamesEnd = __llvm_profile_end_vtabnames(); return __llvm_profile_get_size_for_buffer_internal( DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, - NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd); + NamesBegin, NamesEnd); } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { intptr_t BeginI = (intptr_t)Begin, EndI 
= (intptr_t)End; - // `sizeof(__llvm_profile_data) - 1` is required in the numerator when - // [Begin, End] represents an inclusive range. - // For ELF, [Begin, End) represents the address of linker-inserted - // symbols `__start__` and `__stop_`. - // Thereby, `End` is one byte past the inclusive range, and - // `sizeof(__llvm_profile_data) - 1` is not necessary in the numerator to get - // the correct number of profile data. - // FIXME: Consider removing `sizeof(__llvm_profile_data) - 1` if this is true - // across platforms. return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) / sizeof(__llvm_profile_data); } @@ -84,26 +71,6 @@ uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, return __llvm_profile_get_num_data(Begin, End) * sizeof(__llvm_profile_data); } -// Counts the number of `VTableProfData` elements within the range of [Begin, -// End). Caller should guarantee that End points to one byte past the inclusive -// range. -// FIXME: Add a compiler-rt test to make sure the number of vtables in the -// raw profile is the same as the number of vtable elements in the instrumented -// binary. -COMPILER_RT_VISIBILITY -uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, - const VTableProfData *End) { - // Convert pointers to intptr_t to use integer arithmetic. 
- intptr_t EndI = (intptr_t)End, BeginI = (intptr_t)Begin; - return (EndI - BeginI) / sizeof(VTableProfData); -} - -COMPILER_RT_VISIBILITY -uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, - const VTableProfData *End) { - return (intptr_t)(End) - (intptr_t)(Begin); -} - COMPILER_RT_VISIBILITY size_t __llvm_profile_counter_entry_size(void) { if (__llvm_profile_get_version() & VARIANT_MASK_BYTE_COVERAGE) return sizeof(uint8_t); @@ -152,13 +119,11 @@ static int needsCounterPadding(void) { } COMPILER_RT_VISIBILITY -int __llvm_profile_get_padding_sizes_for_counters( +void __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, - uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, - uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames, - uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVName) { - // Counter padding is needed only if continuous mode is enabled. + uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, + uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes, + uint64_t *PaddingBytesAfterNames) { if (!needsCounterPadding()) { *PaddingBytesBeforeCounters = 0; *PaddingBytesAfterCounters = @@ -166,19 +131,9 @@ int __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = __llvm_profile_get_num_padding_bytes(NumBitmapBytes); *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize); - if (PaddingBytesAfterVTable != NULL) - *PaddingBytesAfterVTable = - __llvm_profile_get_num_padding_bytes(VTableSize); - if (PaddingBytesAfterVName != NULL) - *PaddingBytesAfterVName = __llvm_profile_get_num_padding_bytes(VNameSize); - return 0; + return; } - // Value profiling not supported in continuous mode at profile-write time. - // Return -1 to alert the incompatibility. 
- if (VTableSize != 0 || VNameSize != 0) - return -1; - // In continuous mode, the file offsets for headers and for the start of // counter sections need to be page-aligned. *PaddingBytesBeforeCounters = @@ -187,22 +142,13 @@ int __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = calculateBytesNeededToPageAlign(NumBitmapBytes); *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize); - // Set these two variables to zero to avoid uninitialized variables - // even if VTableSize and VNameSize are known to be zero. - if (PaddingBytesAfterVTable != NULL) - *PaddingBytesAfterVTable = 0; - if (PaddingBytesAfterVName != NULL) - *PaddingBytesAfterVName = 0; - return 0; } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, - const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, - const char *VNamesBegin, const char *VNamesEnd) { + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd) { /* Match logic in __llvm_profile_write_buffer(). */ const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char); uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -210,29 +156,20 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( __llvm_profile_get_counters_size(CountersBegin, CountersEnd); const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); - const uint64_t VTableSize = - __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); - const uint64_t VNameSize = - __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Determine how much padding is needed before/after the counters and after * the names. 
*/ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes, - PaddingBytesAfterVTable, PaddingBytesAfterVNames; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; __llvm_profile_get_padding_sizes_for_counters( - DataSize, CountersSize, NumBitmapBytes, NamesSize, 0 /* VTableSize */, - 0 /* VNameSize */, &PaddingBytesBeforeCounters, - &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, - &PaddingBytesAfterNames, &PaddingBytesAfterVTable, - &PaddingBytesAfterVNames); + DataSize, CountersSize, NumBitmapBytes, NamesSize, + &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + DataSize + PaddingBytesBeforeCounters + CountersSize + PaddingBytesAfterCounters + NumBitmapBytes + - PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames + - VTableSize + PaddingBytesAfterVTable + VNameSize + - PaddingBytesAfterVNames; + PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames; } COMPILER_RT_VISIBILITY @@ -254,10 +191,7 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal( const char *NamesBegin, const char *NamesEnd) { ProfDataWriter BufferWriter; initBufferWriter(&BufferWriter, Buffer); - // Set virtual table arguments to NULL since they are not supported yet. 
- return lprofWriteDataImpl( - &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, - BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd, - /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL, - /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0); + return lprofWriteDataImpl(&BufferWriter, DataBegin, DataEnd, CountersBegin, + CountersEnd, BitmapBegin, BitmapEnd, 0, NamesBegin, + NamesEnd, 0); } diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index d5bd0e41fb129..03ed67fcfa766 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -22,9 +22,7 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, - const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, - const char *VNamesBegin, const char *VNamesEnd); + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd); /*! * \brief Write instrumentation data to the given buffer, given explicit @@ -158,9 +156,7 @@ int lprofWriteDataImpl(ProfDataWriter *Writer, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, const VTableProfData *VTableBegin, - const VTableProfData *VTableEnd, const char *VNamesBegin, - const char *VNamesEnd, int SkipNameDataWrite); + const char *NamesEnd, int SkipNameDataWrite); /* Merge value profile data pointed to by SrcValueProfData into * in-memory profile counters pointed by to DstData. 
*/ diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index c0706b73e1668..b5850e99ee37d 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -107,26 +107,6 @@ static uintptr_t signextIfWin64(void *V) { #endif } -// Skip names section, vtable profile data section and vtable names section -// for runtime profile merge. To merge runtime addresses from multiple -// profiles collected from the same instrumented binary, the binary should be -// loaded at fixed base address (e.g., build with -no-pie, or run with ASLR -// disabled). In this set-up these three sections remain unchanged. -static uint64_t -getDistanceFromCounterToValueProf(const __llvm_profile_header *const Header) { - const uint64_t VTableSectionSize = - Header->NumVTables * sizeof(VTableProfData); - const uint64_t PaddingBytesAfterVTableSection = - __llvm_profile_get_num_padding_bytes(VTableSectionSize); - const uint64_t VNamesSize = Header->VNamesSize; - const uint64_t PaddingBytesAfterVNamesSize = - __llvm_profile_get_num_padding_bytes(VNamesSize); - return Header->NamesSize + - __llvm_profile_get_num_padding_bytes(Header->NamesSize) + - VTableSectionSize + PaddingBytesAfterVTableSection + VNamesSize + - PaddingBytesAfterVNamesSize; -} - COMPILER_RT_VISIBILITY int __llvm_profile_merge_from_buffer(const char *ProfileData, uint64_t ProfileSize) { @@ -157,7 +137,8 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, SrcBitmapStart = SrcCountersEnd; SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes; SrcValueProfDataStart = - SrcNameStart + getDistanceFromCounterToValueProf(Header); + SrcNameStart + Header->NamesSize + + __llvm_profile_get_num_padding_bytes(Header->NamesSize); if (SrcNameStart < SrcCountersStart || SrcNameStart < SrcBitmapStart) return 1; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c 
index d2554a2702aaf..19266ab6c6fb8 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -24,12 +24,8 @@ #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) -#define PROF_VNAME_START INSTR_PROF_SECT_START(INSTR_PROF_VNAME_COMMON) -#define PROF_VNAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNAME_COMMON) #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) -#define PROF_VTABLE_START INSTR_PROF_SECT_START(INSTR_PROF_VTAB_COMMON) -#define PROF_VTABLE_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VTAB_COMMON) #define PROF_BITS_START INSTR_PROF_SECT_START(INSTR_PROF_BITS_COMMON) #define PROF_BITS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_BITS_COMMON) #define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON) @@ -45,10 +41,6 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; -extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; -extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; -extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; -extern char PROF_VNAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern uint32_t PROF_ORDERFILE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; @@ -71,18 +63,6 @@ COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_names(void) { COMPILER_RT_VISIBILITY const char *__llvm_profile_end_names(void) { return &PROF_NAME_STOP; } -COMPILER_RT_VISIBILITY 
const char *__llvm_profile_begin_vtabnames(void) { - return &PROF_VNAME_START; -} -COMPILER_RT_VISIBILITY const char *__llvm_profile_end_vtabnames(void) { - return &PROF_VNAME_STOP; -} -COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_begin_vtables(void) { - return &PROF_VTABLE_START; -} -COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_end_vtables(void) { - return &PROF_VTABLE_STOP; -} COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) { return &PROF_CNTS_START; } diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c index 8816a71155511..4d767d1385148 100644 --- a/compiler-rt/lib/profile/InstrProfilingWriter.c +++ b/compiler-rt/lib/profile/InstrProfilingWriter.c @@ -250,14 +250,9 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer, const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); - const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); - const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); - const char *VNamesBegin = __llvm_profile_begin_vtabnames(); - const char *VNamesEnd = __llvm_profile_end_vtabnames(); return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, VPDataReader, - NamesBegin, NamesEnd, VTableBegin, VTableEnd, - VNamesBegin, VNamesEnd, SkipNameDataWrite); + NamesBegin, NamesEnd, SkipNameDataWrite); } COMPILER_RT_VISIBILITY int @@ -266,9 +261,7 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, const VTableProfData *VTableBegin, - const VTableProfData *VTableEnd, const char *VNamesBegin, - const char *VNamesEnd, int SkipNameDataWrite) { + const char *NamesEnd, int 
SkipNameDataWrite) { /* Calculate size of sections. */ const uint64_t DataSectionSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -280,12 +273,6 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); const uint64_t NamesSize = __llvm_profile_get_name_size(NamesBegin, NamesEnd); - const uint64_t NumVTables = - __llvm_profile_get_num_vtable(VTableBegin, VTableEnd); - const uint64_t VTableSectionSize = - __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); - const uint64_t VNamesSize = - __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Create the header. */ __llvm_profile_header Header; @@ -293,15 +280,11 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, /* Determine how much padding is needed before/after the counters and after * the names. */ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames, - PaddingBytesAfterVTable, PaddingBytesAfterVNames; - if (__llvm_profile_get_padding_sizes_for_counters( - DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, - VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters, - &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, - &PaddingBytesAfterNames, &PaddingBytesAfterVTable, - &PaddingBytesAfterVNames) == -1) - return -1; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; + __llvm_profile_get_padding_sizes_for_counters( + DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, + &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); { /* Initialize header structure. 
*/ @@ -340,11 +323,7 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, {BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0}, {NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1}, {SkipNameDataWrite ? NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}, - {VTableBegin, sizeof(uint8_t), VTableSectionSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterVTable, 1}, - {SkipNameDataWrite ? NULL : VNamesBegin, sizeof(uint8_t), VNamesSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterVNames, 1}}; + {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}}; if (Writer->Write(Writer, IOVecData, sizeof(IOVecData) / sizeof(*IOVecData))) return -1; diff --git a/compiler-rt/test/profile/instrprof-write-buffer-internal.c b/compiler-rt/test/profile/instrprof-write-buffer-internal.c index 2c1c29ac0c588..d9670f739ca98 100644 --- a/compiler-rt/test/profile/instrprof-write-buffer-internal.c +++ b/compiler-rt/test/profile/instrprof-write-buffer-internal.c @@ -31,8 +31,7 @@ char *__llvm_profile_end_bitmap(void); uint64_t __llvm_profile_get_size_for_buffer_internal( const void *DataBegin, const void *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, - const char *NamesBegin, const char *NamesEnd, const void *VTableBegin, - const void *VTableEnd, const char *VNamesBegin, const char *VNamesEnd); + const char *NamesBegin, const char *NamesEnd); int __llvm_profile_write_buffer_internal( char *Buffer, const void *DataBegin, const void *DataEnd, @@ -46,8 +45,7 @@ int main(int argc, const char *argv[]) { __llvm_profile_begin_data(), __llvm_profile_end_data(), __llvm_profile_begin_counters(), __llvm_profile_end_counters(), __llvm_profile_begin_bitmap(), __llvm_profile_end_bitmap(), - __llvm_profile_begin_names(), __llvm_profile_end_names(), NULL, NULL, - NULL, NULL); + __llvm_profile_begin_names(), __llvm_profile_end_names()); char *buf = malloc(bufsize); 
int ret = __llvm_profile_write_buffer_internal( diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 25ec06a739202..a928ba6961f36 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -831,7 +831,6 @@ struct InstrProfRecord { struct ValueProfData { std::vector IndirectCallSites; std::vector MemOPSizes; - std::vector VTableTargets; }; std::unique_ptr ValueData; @@ -854,8 +853,6 @@ struct InstrProfRecord { return ValueData->IndirectCallSites; case IPVK_MemOPSize: return ValueData->MemOPSizes; - case IPVK_VTableTarget: - return ValueData->VTableTargets; default: llvm_unreachable("Unknown value kind!"); } @@ -1039,9 +1036,7 @@ enum ProfVersion { Version10 = 10, // An additional field is used for bitmap bytes. Version11 = 11, - // VTable profiling, - Version12 = 12, - // The current version is 12. + // The current version is 11. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1062,7 +1057,6 @@ struct Header { uint64_t MemProfOffset; uint64_t BinaryIdOffset; uint64_t TemporalProfTracesOffset; - uint64_t VTableNamesOffset; // New fields should only be added at the end to ensure that the size // computation is correct. The methods below need to be updated to ensure that // the new field is read correctly. @@ -1199,13 +1193,8 @@ template <> inline uint64_t getMagic() { // It should also match the synthesized type in // Transforms/Instrumentation/InstrProfiling.cpp:getOrCreateRegionCounters. 
template struct alignas(8) ProfileData { -#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; -#include "llvm/ProfileData/InstrProfData.inc" -}; - -template struct alignas(8) VTableProfileData { -#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) Type Name; -#include "llvm/ProfileData/InstrProfData.inc" + #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; + #include "llvm/ProfileData/InstrProfData.inc" }; // File header structure of the LLVM profile data in raw format. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 1f77853bb8baa..c907a9736f316 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -96,25 +96,6 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ -/* For a virtual table object, record the name hash to associate profiled - * addresses with global variables, and record {starting address, size in bytes} - * to map the profiled virtual table (which usually have an offset from the - * starting address) back to a virtual table object. */ -#ifndef INSTR_PROF_VTABLE_DATA -#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) -#else -#define INSTR_PROF_VTABLE_DATA_DEFINED -#endif -INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ - VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ - IndexedInstrProf::ComputeHash(PGOVTableName))) -INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ - VTablePointer, VTableAddr) -INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ - ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ - VTableSizeVal)) -#undef INSTR_PROF_VTABLE_DATA -/* INSTR_PROF_VTABLE_DATA end. */ /* This is an internal data structure used by value profiler. 
It * is defined here to allow serialization code sharing by LLVM @@ -166,8 +147,6 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) -INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -209,26 +188,13 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") -/* For virtual table address profiling, the address point of the virtual table - * (i.e., the address contained in objects pointing to a virtual table) are - * profiled. Note this may not be the address of the per C++ class virtual table - * object (e.g., there might be an offset). - * - * The profiled addresses are stored in raw profile, together with the following - * two types of information. - * 1. The (starting and ending) addresses of per C++ class virtual table objects. - * 2. The (compressed) virtual table object names. - * RawInstrProfReader converts profiled virtual table addresses to virtual table - * objects' MD5 hash. - */ -VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. 
*/ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -318,18 +284,12 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") -INSTR_PROF_SECT_ENTRY(IPSK_vname, \ - INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ - INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") -INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ - INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ - INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -708,9 +668,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 10 +#define INSTR_PROF_RAW_VERSION 9 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 12 +#define INSTR_PROF_INDEX_VERSION 11 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -748,12 +708,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names -#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds -#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -764,12 +722,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" -#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" -#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index cfde5d3fc77d6..87f15639a2c3c 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -326,16 +326,12 @@ class RawInstrProfReader : public InstrProfReader { uint64_t NamesDelta; const RawInstrProf::ProfileData *Data; const RawInstrProf::ProfileData *DataEnd; - const RawInstrProf::VTableProfileData *VTableBegin = nullptr; - const RawInstrProf::VTableProfileData *VTableEnd = nullptr; const char *CountersStart; const char *CountersEnd; const char *BitmapStart; const char *BitmapEnd; const char *NamesStart; const char *NamesEnd; - const char *VNamesStart = nullptr; - const 
char *VNamesEnd = nullptr; // After value profile is all read, this pointer points to // the header of next profile data (if exists) const uint8_t *ValueDataStart; @@ -660,15 +656,6 @@ class IndexedInstrProfReader : public InstrProfReader { std::unique_ptr MemProfRecordTable; /// MemProf frame profile data on-disk indexed via frame id. std::unique_ptr MemProfFrameTable; - /// VTableNamePtr points to the beginning of compressed vtable names. - /// When a symtab is constructed from profiles by llvm-profdata, the list of - /// names could be decompressed based on `VTableNamePtr` and - /// `CompressedVTableNamesLen`. - /// A compiler that reads indexed profiles could construct symtab from module - /// IR so it doesn't need the decompressed names. - const char *VTableNamePtr = nullptr; - /// The length of compressed vtable names. - uint64_t CompressedVTableNamesLen = 0; /// Total size of binary ids. uint64_t BinaryIdsSize{0}; /// Start address of binary id length and data pairs. diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index b9afee413853e..2eeeff987399d 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1533,12 +1533,9 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { // When a new field is added in the header add a case statement here to // populate it. static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + IndexedInstrProf::ProfVersion::CurrentVersion == Version11, "Please update the reading code below if a new field has been added, " "if not add a case statement to fall through to the latest version."); - case 12ull: - H.VTableNamesOffset = read(Buffer, offsetOf(&Header::VTableNamesOffset)); - [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: @@ -1564,14 +1561,10 @@ size_t Header::size() const { // When a new field is added to the header add a case statement here to // compute the size as offset of the new field + size of the new field. This // relies on the field being added to the end of the list. - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version11, "Please update the size computation below if a new field has " "been added to the header, if not add a case statement to " "fall through to the latest version."); - case 12ull: - return offsetOf(&Header::VTableNamesOffset) + - sizeof(Header::VTableNamesOffset); - [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 31b742bca14d6..0d8d43daae960 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -366,11 +366,6 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) { return E; Value = IndexedInstrProf::ComputeHash(VD.first); } - } else if (ValueKind == IPVK_VTableTarget) { - if (InstrProfSymtab::isExternalSymbol(VD.first)) - Value = 0; - else - Value = IndexedInstrProf::ComputeHash(VD.first); } else { READ_NUM(VD.first, Value); } @@ -587,17 +582,10 @@ Error RawInstrProfReader::readHeader( auto NumBitmapBytes = 
swap(Header.NumBitmapBytes); auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes); auto NamesSize = swap(Header.NamesSize); - auto VTableNameSize = swap(Header.VNamesSize); - auto NumVTables = swap(Header.NumVTables); ValueKindLast = swap(Header.ValueKindLast); auto DataSize = NumData * sizeof(RawInstrProf::ProfileData); - auto PaddingBytesAfterNames = getNumPaddingBytes(NamesSize); - auto PaddingBytesAfterVTableNames = getNumPaddingBytes(VTableNameSize); - - auto VTableSectionSize = - NumVTables * sizeof(RawInstrProf::VTableProfileData); - auto PaddingBytesAfterVTableProfData = getNumPaddingBytes(VTableSectionSize); + auto PaddingSize = getNumPaddingBytes(NamesSize); // Profile data starts after profile header and binary ids if exist. ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdSize; @@ -606,12 +594,7 @@ Error RawInstrProfReader::readHeader( CountersOffset + CountersSize + PaddingBytesAfterCounters; ptrdiff_t NamesOffset = BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes; - ptrdiff_t VTableProfDataOffset = - NamesOffset + NamesSize + PaddingBytesAfterNames; - ptrdiff_t VTableNameOffset = VTableProfDataOffset + VTableSectionSize + - PaddingBytesAfterVTableProfData; - ptrdiff_t ValueDataOffset = - VTableNameOffset + VTableNameSize + PaddingBytesAfterVTableNames; + ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; auto *Start = reinterpret_cast(&Header); if (Start + ValueDataOffset > DataBuffer->getBufferEnd()) @@ -631,14 +614,8 @@ Error RawInstrProfReader::readHeader( Data = reinterpret_cast *>( Start + DataOffset); DataEnd = Data + NumData; - VTableBegin = - reinterpret_cast *>( - Start + VTableProfDataOffset); - VTableEnd = VTableBegin + NumVTables; NamesStart = Start + NamesOffset; NamesEnd = NamesStart + NamesSize; - VNamesStart = Start + VTableNameOffset; - VNamesEnd = VNamesStart + VTableNameSize; } CountersStart = Start + CountersOffset; @@ -1283,23 +1260,6 @@ Error 
IndexedInstrProfReader::readHeader() { "corrupted binary ids"); } - if (GET_VERSION(Header->formatVersion()) >= 12) { - uint64_t VTableNamesOffset = - endian::byte_swap( - Header->VTableNamesOffset); - const unsigned char *Ptr = Start + VTableNamesOffset; - - CompressedVTableNamesLen = - support::endian::readNext(Ptr); - - // Writer first writes the length of compressed string, and then the actual - // content. - VTableNamePtr = (const char *)Ptr; - if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd()) - return make_error(instrprof_error::truncated); - } - if (GET_VERSION(Header->formatVersion()) >= 10 && Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) { uint64_t TemporalProfTracesOffset = diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 3e0a0e0d70116..d65f8fe50313d 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -455,11 +455,12 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.MemProfOffset = 0; Header.BinaryIdOffset = 0; Header.TemporalProfTracesOffset = 0; - Header.VTableNamesOffset = 0; + int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); - // Only write out the first four fields. We need to remember the offset of the - // remaining fields to allow back patching later. - for (int I = 0; I < 4; I++) + // Only write out all the fields except 'HashOffset', 'MemProfOffset', + // 'BinaryIdOffset' and `TemporalProfTracesOffset`. We need to remember the + // offset of these fields to allow back patching later. + for (int I = 0; I < N - 4; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -483,9 +484,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { uint64_t TemporalProfTracesOffset = OS.tell(); OS.write(0); - uint64_t VTableNamesOffset = OS.tell(); - OS.write(0); - // Reserve space to write profile summary data. 
uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -606,31 +604,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { OS.writeByte(0); } - uint64_t VTableNamesSectionStart = OS.tell(); - - // Use a dummy (and uncompressed) string as compressed vtable names and get - // the necessary profile format change in place for version 12. - // TODO: Store the list of vtable names in InstrProfWriter and use the - // real compressed name. - std::string CompressedVTableNames = "VTableNames"; - - uint64_t CompressedStringLen = CompressedVTableNames.length(); - - // Record the length of compressed string. - OS.write(CompressedStringLen); - - // Write the chars in compressed strings. - for (auto &c : CompressedVTableNames) - OS.writeByte(static_cast(c)); - - // Pad up to a multiple of 8. - // InstrProfReader would read bytes according to 'CompressedStringLen'. - uint64_t PaddedLength = alignTo(CompressedStringLen, 8); - - for (uint64_t K = CompressedStringLen; K < PaddedLength; K++) { - OS.writeByte(0); - } - uint64_t TemporalProfTracesSectionStart = 0; if (static_cast(ProfileKind & InstrProfKind::TemporalProfile)) { TemporalProfTracesSectionStart = OS.tell(); @@ -674,7 +647,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Patch the Header.TemporalProfTracesOffset (=0 for profiles without // traces). {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1}, - {VTableNamesOffset, &VTableNamesSectionStart, 1}, // Patch the summary data. 
{SummaryOffset, reinterpret_cast(TheSummary.get()), (int)(SummarySize / sizeof(uint64_t))}, @@ -727,8 +699,7 @@ Error InstrProfWriter::validateRecord(const InstrProfRecord &Func) { std::unique_ptr VD = Func.getValueForSite(VK, S); DenseSet SeenValues; for (uint32_t I = 0; I < ND; I++) - if ((VK != IPVK_IndirectCallTarget && VK != IPVK_VTableTarget) && - !SeenValues.insert(VD[I].Value).second) + if ((VK != IPVK_IndirectCallTarget) && !SeenValues.insert(VD[I].Value).second) return make_error(instrprof_error::invalid_prof); } } @@ -776,7 +747,7 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash, OS << ND << "\n"; std::unique_ptr VD = Func.getValueForSite(VK, S); for (uint32_t I = 0; I < ND; I++) { - if (VK == IPVK_IndirectCallTarget || VK == IPVK_VTableTarget) + if (VK == IPVK_IndirectCallTarget) OS << Symtab.getFuncOrVarNameIfDefined(VD[I].Value) << ":" << VD[I].Count << "\n"; else diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll index 08cbcaa962b76..bbf895ea4b34e 100644 --- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll +++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll @@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu" @__profn_foo = private constant [3 x i8] c"foo" ; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) -; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), +; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) +; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { 
i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), @__profn_bar = private constant [3 x i8] c"bar" ; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) -; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), +; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) +; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), ; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names" ; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames" diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw index 3daa98f937b691880ffff203c9426bfacddf749d..5efda10bb98a941c04b6846db05d3691bc36aac0 100644 GIT binary patch delta 117 zcmZ3$GJ%D&u_!ISs37M*=R{6_L67IVA1SZ;|9^9yv+SKv1_s87mFlblGl86mORZTI rz>KHXyapf!P9f delta 133 zcmbQhvVeuNu_!ISs37M**F;W##f(djpGdFz|9^9xwDglu1`NP7F;ks2U=>hu;#6za s1Tf>OHE#ik0aQLiPe%I5WLZXI)&n4s$)Sw16~Kysa*R;Jz`Bw604`-Eq5uE@ diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll index 1bad0db1b4762..8c6942c0f527b 100644 --- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll +++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll @@ -13,9 +13,9 @@ $foo = comdat any ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat ; CHECK-NOT: __profn__stdin__foo ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] 
zeroinitializer, section "__llvm_prf_cnts", comdat, align 8 -; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null +; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null ; CHECK-NOT: @foo -; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 +; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 ; CHECK: @__llvm_prf_nm ; CHECK: @llvm.compiler.used diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw index a3e884343942ebc70ba95ab4ee006630b6816d80..9cd225587c92511e99f3497ce1d5f47c6fc5f0af 100644 GIT binary patch delta 39 vcmeys|A3#fu_!ISs37M*=R{6_K?|$bHJ=*(|Lz`(e%(w!XuMI#TR delta 40 wcmdnMe}JE}u_!ISs37M**F;W#z6X_m+ZL?)|9`G8Q)PVUWItx9jRg+u0BX?@Q2+n{ diff --git a/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw deleted file mode 100644 index 84707ba2070a92b8683010d9daaef747df35f9ac..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 528 zcmZoHO3N=Q$obF700xW@ih+Rz#(>i3d^BkWXQ;q~{}8~jee0hktN#DrJkOIkI+TF{ zX0YI^%?f`vOg;fr_5L!KFBeQb%shva5cM!VOdpINJ<~YH=c-N(O#cd~eK7d|0{XA2 zYFH&6%DWHJCbaDydjXpM1gQQWo?dWwGr?0yS0{Tm3_5AzQ$ 
z+Q7KtR(HRVzu%dYp1!6!$!AXbT=MqY*4O{3u}gA_;W2kfsb$ZfsH;9ZvV7_@)#;23 r{WSu+S$HaLo%TI*hM9pynsFJ}wH81UW(Uaqj8G0Nd|-00@P_dLvBrhT diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test index 61881b69cfd5c..eda63203a304a 100644 --- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test +++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test @@ -10,12 +10,10 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) -// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw // There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes. // 2 * 8 binary ID sizes // + 2 * 20 binary IDs (of size 20) @@ -34,8 +32,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. 
RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test index 316a9a4c9df4c..38b838e0d100a 100644 --- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test +++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -12,8 +12,6 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Check for a corrupted size being too large past the end of the file. RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test index 8b686d5c50cb7..c967e850dbe35 100644 --- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test +++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test @@ -10,12 +10,10 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) -// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -28,8 +26,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test index 089afad420622..2e747f81a6bfa 100644 --- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test +++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test @@ -10,12 +10,10 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) -// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -28,8 +26,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test index e404ba4210cc1..3c23bc7dd0f7f 100644 --- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test +++ 
b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test @@ -10,12 +10,10 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) -// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -28,8 +26,6 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test index ee54bfb978567..4a5c42843ff4d 100644 --- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test +++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw // We should fail on this because the binary IDs is not a multiple of 8 bytes. 
RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -10,8 +10,6 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test index dfa163f1f3439..2a92575ee3407 100644 --- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test +++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test @@ -15,8 +15,6 @@ RUN: printf '\0\0\0\0\0\0\0\20' >> %t RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: not llvm-profdata show %t -o /dev/null 2>&1 | FileCheck %s diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test index 63782c8b94d4a..8220361df6cfa 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test @@ -1,6 +1,5 @@ -// Header RUN: printf '\377lprofR\201' > %t -RUN: printf '\0\0\0\0\0\0\0\12' >> %t +RUN: printf '\0\0\0\0\0\0\0\11' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -13,8 +12,6 @@ RUN: printf '\0\0\0\0\1\0\0\0' >> %t RUN: printf '\0\0\0\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -23,8 +20,9 @@ 
RUN: printf '\3\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t @@ -33,8 +31,9 @@ RUN: printf '\2\xff\xff\xd3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\2' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test index e9569bec1178b..9352ae132380d 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201Rforpl\377' > %t -RUN: printf '\12\0\0\0\0\0\0\0' >> %t +RUN: printf '\11\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,8 +12,6 @@ RUN: printf '\0\0\0\1\0\0\0\0' >> %t RUN: printf '\0\0\0\3\0\0\0\0' >> %t RUN: printf '\0\0\0\2\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -22,8 +20,9 @@ RUN: printf '\0\0\0\3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -32,8 +31,9 @@ RUN: printf '\xd3\xff\xff\2' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf 
'\2\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test index 0bc579eec58ab..c3e995add6ff2 100644 --- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test @@ -1,5 +1,5 @@ RUN: printf '\377lprofr\201' > %t -RUN: printf '\0\0\0\0\0\0\0\12' >> %t +RUN: printf '\0\0\0\0\0\0\0\11' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,8 +12,6 @@ RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -22,8 +20,9 @@ RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\02' >> %t @@ -32,8 +31,9 @@ RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\02' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test index ca9ea54c3f014..0b3ef2a89abe5 100644 --- 
a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t -RUN: printf '\12\0\0\0\0\0\0\0' >> %t +RUN: printf '\11\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,8 +12,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\4\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -22,8 +20,9 @@ RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -32,8 +31,9 @@ RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\02\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test index 70a4210dea9f8..f4a9aa8e1bbc3 100644 --- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test +++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t-foo.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t-foo.profraw @@ -12,8 +12,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\254\275\030\333\114\302\370\134' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw @@ -28,7 +26,7 @@ RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw RUN: printf '\201rforpl\377' > %t-bar.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw @@ -41,8 +39,6 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\067\265\035\031\112\165\023\344' >> %t-bar.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw From 386aa7b16977150da917a78423fd05cb19609850 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Wed, 21 Feb 2024 22:52:02 -0800 Subject: [PATCH 183/351] [mlir][Vector] Replace `vector.shuffle` with `vector.interleave` in vector narrow type emulation (#82550) This PR replaces the generation of `vector.shuffle` with `vector.interleave` in the i4 conversions in vector narrow type emulation. The multi dimensional semantics of `vector.interleave` allow us to enable these conversion emulations also for multi dimensional vectors. 
--- .../Transforms/VectorEmulateNarrowType.cpp | 27 ++---- .../Vector/vector-rewrite-narrow-types.mlir | 82 +++++++++++++------ 2 files changed, 68 insertions(+), 41 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp index 36fb66708407b..fc11ae63e718a 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @@ -724,9 +724,8 @@ BitCastRewriter::BitCastRewriter(VectorType sourceVectorType, static LogicalResult commonConversionPrecondition(PatternRewriter &rewriter, VectorType preconditionType, Operation *op) { - if (!preconditionType || preconditionType.getRank() != 1 || - preconditionType.isScalable()) - return rewriter.notifyMatchFailure(op, "scalable or >1-D vector"); + if (!preconditionType || preconditionType.isScalable()) + return rewriter.notifyMatchFailure(op, "scalable vector"); // TODO: consider relaxing this restriction in the future if we find ways // to really work with subbyte elements across the MLIR/LLVM boundary. @@ -743,6 +742,9 @@ LogicalResult BitCastRewriter::commonPrecondition(PatternRewriter &rewriter, if (!enumerator.sourceVectorType || !enumerator.targetVectorType) return rewriter.notifyMatchFailure(op, "types are not vector"); + if (!preconditionType || preconditionType.getRank() != 1) + return rewriter.notifyMatchFailure(op, "unsupported >1-D vector"); + return commonConversionPrecondition(rewriter, preconditionType, op); } @@ -855,7 +857,6 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc, "Expected i4 type"); // 1. Generate a bitcast vector -> vector. 
- int64_t vecDimSize = srcVecType.getShape().back(); SmallVector i8VecShape = llvm::to_vector(srcVecType.getShape()); constexpr int64_t i4Toi8BitwidthFactor = 2; i8VecShape.back() = i8VecShape.back() / i4Toi8BitwidthFactor; @@ -871,16 +872,8 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc, Value low = rewriter.create(loc, shl, shiftValues); Value high = rewriter.create(loc, i8Vector, shiftValues); - // 3. Interleave low and high i8 elements using a shuffle. - SmallVector interleaveMaskValues; - interleaveMaskValues.reserve(vecDimSize); - for (int i = 0, end = vecDimSize / 2; i < end; ++i) { - interleaveMaskValues.push_back(i); - interleaveMaskValues.push_back(i + (vecDimSize / 2)); - } - - return rewriter.create( - loc, low, high, rewriter.getI64ArrayAttr(interleaveMaskValues)); + // 3. Interleave low and high i8 elements. + return rewriter.create(loc, low, high); } namespace { @@ -1008,8 +1001,7 @@ struct RewriteExtOfBitCast : OpRewritePattern { /// %1 = arith.shli %0, 4 : vector<4xi8> /// %2 = arith.shrsi %1, 4 : vector<4xi8> /// %3 = arith.shrsi %0, 4 : vector<4xi8> -/// %4 = vector.shuffle %2, %3 [0, 4, 1, 5, 2, 6, 3, 7] -/// : vector<4xi8>, vector<4xi8> +/// %4 = vector.interleave %2, %3 : vector<4xi8> /// %5 = arith.extsi %4 : vector<8xi8> to vector<8xi32> /// /// arith.sitofp %in : vector<8xi4> to vector<8xf32> @@ -1018,8 +1010,7 @@ struct RewriteExtOfBitCast : OpRewritePattern { /// %1 = arith.shli %0, 4 : vector<4xi8> /// %2 = arith.shrsi %1, 4 : vector<4xi8> /// %3 = arith.shrsi %0, 4 : vector<4xi8> -/// %4 = vector.shuffle %2, %3 [0, 4, 1, 5, 2, 6, 3, 7] -/// : vector<4xi8>, vector<4xi8> +/// %4 = vector.interleave %2, %3 : vector<4xi8> /// %5 = arith.sitofp %4 : vector<8xi8> to vector<8xf32> /// template diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir index 02063a81664b8..94e78ce40a3c1 100644 --- 
a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir +++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir @@ -195,53 +195,89 @@ func.func @f3ext(%a: vector<5xi8>) -> vector<8xi17> { // CHECK-LABEL: func.func @aligned_extsi( func.func @aligned_extsi(%a: vector<8xi4>) -> vector<8xi32> { - // CHECK: arith.shli - // CHECK: arith.shrsi - // CHECK: arith.shrsi - // CHECK: vector.shuffle - // CHECK: arith.extsi %{{.*}} : vector<8xi8> to vector<8xi32> +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> +// CHECK: %[[I32:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32> %0 = arith.extsi %a : vector<8xi4> to vector<8xi32> return %0 : vector<8xi32> } +// CHECK-LABEL: func.func @aligned_extsi_2d( +func.func @aligned_extsi_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8> +// CHECK: %[[I32:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32> + %0 = arith.extsi %a : vector<8x32xi4> to 
vector<8x32xi32> + return %0 : vector<8x32xi32> +} + // CHECK-LABEL: func.func @aligned_extsi_base_case( func.func @aligned_extsi_base_case(%a: vector<8xi4>) -> vector<8xi8> { - // CHECK: arith.shli - // CHECK: arith.shrsi - // CHECK: arith.shrsi - // CHECK: vector.shuffle - // CHECK-NOT: arith.extsi +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> %0 = arith.extsi %a : vector<8xi4> to vector<8xi8> return %0 : vector<8xi8> } // CHECK-LABEL: func.func @aligned_sitofp( func.func @aligned_sitofp(%a: vector<8xi4>) -> vector<8xf32> { - // CHECK: arith.shli - // CHECK: arith.shrsi - // CHECK: arith.shrsi - // CHECK: shuffle - // CHECK: arith.sitofp %{{.*}} : vector<8xi8> to vector<8xf32> +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xf32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> +// CHECK: %[[F32:.*]] = arith.sitofp %[[INTERLEAVE]] : vector<8xi8> to vector<8xf32> %0 = arith.sitofp %a : vector<8xi4> to vector<8xf32> return %0 : vector<8xf32> } +// CHECK-LABEL: func.func @aligned_sitofp_2d( +func.func @aligned_sitofp_2d(%a: vector<8x32xi4>) -> 
vector<8x32xf32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xf32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8> +// CHECK: %[[F32:.*]] = arith.sitofp %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xf32> + %0 = arith.sitofp %a : vector<8x32xi4> to vector<8x32xf32> + return %0 : vector<8x32xf32> +} + // CHECK-LABEL: func.func @i4_transpose( -// CHECK-SAME: %[[A:[0-9a-z]*]] func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> { - // CHECK: %[[EXT:.*]] = arith.extsi %[[A]] : vector<8x16xi4> to vector<8x16xi8> - // CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> - // CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi4> +// CHECK-SAME: %[[IN:.*]]: vector<8x16xi4>) -> vector<16x8xi4> { +// CHECK: %[[EXT:.*]] = vector.interleave +// CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> +// CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi4> %0 = vector.transpose %a, [1, 0] : vector<8x16xi4> to vector<16x8xi4> return %0 : vector<16x8xi4> } // CHECK-LABEL: func.func @i7_transpose( -// CHECK-SAME: %[[A:[0-9a-z]*]] func.func @i7_transpose(%a: vector<8x16xi7>) -> vector<16x8xi7> { - // CHECK: %[[EXT:.*]] = arith.extsi %[[A]] : vector<8x16xi7> to vector<8x16xi8> - // CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> - // CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi7> +// 
CHECK-SAME: %[[IN:.*]]: vector<8x16xi7>) -> vector<16x8xi7> { +// CHECK: %[[EXT:.*]] = arith.extsi %[[IN]] : vector<8x16xi7> to vector<8x16xi8> +// CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> +// CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi7> %0 = vector.transpose %a, [1, 0] : vector<8x16xi7> to vector<16x8xi7> return %0 : vector<16x8xi7> } From 675791335285fa86434dc46e5c92f543e0e79d19 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 21 Feb 2024 22:59:03 -0800 Subject: [PATCH 184/351] [lldb][test] Fix PythonDataObjectsTest This is using `FileSystem::Instance()` w/o calling `FileSystem::Initialize()`. Use `SubsystemRAII` to do that. --- .../Python/PythonDataObjectsTests.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp index a4db4627f935b..b90fbb7830995 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp @@ -11,6 +11,7 @@ #include "Plugins/ScriptInterpreter/Python/PythonDataObjects.h" #include "Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h" +#include "TestingSupport/SubsystemRAII.h" #include "lldb/Host/File.h" #include "lldb/Host/FileSystem.h" #include "lldb/Host/HostInfo.h" @@ -26,6 +27,8 @@ using namespace lldb_private::python; using llvm::Expected; class PythonDataObjectsTest : public PythonTestSuite { + SubsystemRAII subsystems; + public: void SetUp() override { PythonTestSuite::SetUp(); @@ -209,8 +212,8 @@ TEST_F(PythonDataObjectsTest, TestPythonBoolean) { }; // Test PythonBoolean constructed from long integer values. - test_from_long(0); // Test 'false' value. - test_from_long(1); // Test 'true' value. + test_from_long(0); // Test 'false' value. + test_from_long(1); // Test 'true' value. 
test_from_long(~0); // Any value != 0 is 'true'. } @@ -811,7 +814,8 @@ main = foo testing::ContainsRegex("line 7, in baz"), testing::ContainsRegex("ZeroDivisionError"))))); -#if !((defined(_WIN32) || defined(_WIN64)) && (defined(__aarch64__) || defined(_M_ARM64))) +#if !((defined(_WIN32) || defined(_WIN64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) static const char script2[] = R"( class MyError(Exception): From 6676f67e3103bb6779d226de6bb4f0f8f8ab99f2 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 22 Feb 2024 07:20:47 +0000 Subject: [PATCH 185/351] [mlir][Bazel] Remove stub target which is not needed anymore. --- .../llvm-project-overlay/mlir/BUILD.bazel | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index bb7a34ef76772..694602b1a7cbf 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5540,7 +5540,6 @@ cc_library( ":SCFDialect", ":SPIRVDialect", ":SPIRVTarget", - ":SerializeToCubin_stub", ":SideEffectInterfaces", ":Support", ":ToLLVMIRTranslation", @@ -5579,44 +5578,6 @@ cc_library( ]), ) -write_file( - name = "SerializeToCubin_stub_cc", - out = "SerializeToCubin_stub.cc", - content = [ - """ -#include "mlir/Dialect/GPU/Transforms/Passes.h" - -// Provide a weak registration stub in case the real SerializeToCubin is not -// linked in. - -#if defined(_MSC_VER) -// This might not work correctly, but it avoids a compilation error because -// MSVC does not support __attribute__((weak)). 
-void mlir::registerGpuSerializeToCubinPass() {} -#else -__attribute__((weak)) void mlir::registerGpuSerializeToCubinPass() {} -#endif -""", - ], -) - -cc_library( - name = "SerializeToCubin_stub", - srcs = [":SerializeToCubin_stub_cc"], - hdrs = glob(["include/mlir/Dialect/GPU/Transforms/*.h"]), - includes = ["include"], - deps = [ - ":GPUDialect", - ":GPUPassIncGen", - ":IR", - ":Pass", - ":SPIRVDialect", - ":Support", - ":VectorDialect", - "//llvm:Support", - ], -) - td_library( name = "GPUTransformOpsTdFiles", srcs = [ @@ -13193,7 +13154,6 @@ cc_library( ], ) - ##---------------------------------------------------------------------------## # Allocation interfaces ##---------------------------------------------------------------------------## From bc1c86b810e518a8e3fa90d5c26908c43788873d Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 22 Feb 2024 07:24:46 +0000 Subject: [PATCH 186/351] [mlir][Bazel] Also remove SerializeToCubin target. --- .../llvm-project-overlay/mlir/BUILD.bazel | 30 +------------------ 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 694602b1a7cbf..a34874efa5b19 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3231,9 +3231,7 @@ cc_library( ":Transforms", ":VectorToLLVM", ":VectorTransforms", - ] + if_cuda_available([ - ":SerializeToCubin", - ]), + ], ) ##---------------------------------------------------------------------------## @@ -5504,9 +5502,6 @@ cc_library( "lib/Dialect/GPU/Transforms/*.cpp", "lib/Dialect/GPU/Transforms/*.h", ], - exclude = [ - "lib/Dialect/GPU/Transforms/SerializeToCubin.cpp", - ], ), hdrs = glob(["include/mlir/Dialect/GPU/Transforms/*.h"]), includes = ["include"], @@ -5556,28 +5551,6 @@ cc_library( ]), ) -cc_library( - name = "SerializeToCubin", - srcs = [ - "lib/Dialect/GPU/Transforms/SerializeToCubin.cpp", - ], - 
local_defines = if_cuda_available(["MLIR_GPU_TO_CUBIN_PASS_ENABLE"]), - deps = [ - ":GPUDialect", - ":GPUPassIncGen", - ":GPUTransforms", - ":NVVMDialect", - ":NVVMToLLVMIRTranslation", - ":Pass", - ":Support", - ":ToLLVMIRTranslation", - "//llvm:Support", - ] + if_cuda_available([ - "@cuda//:cuda_headers", - "@cuda//:libcuda", - ]), -) - td_library( name = "GPUTransformOpsTdFiles", srcs = [ @@ -9190,7 +9163,6 @@ cc_binary( ":Pass", ":QuantOps", ":SCFToGPU", - ":SerializeToCubin", ":Support", ":Transforms", "//llvm:AllTargetsCodeGens", From 7e97ae35ae2d1c38d149e670139a538bdba86e93 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Thu, 22 Feb 2024 15:51:19 +0800 Subject: [PATCH 187/351] [RISCV] Teach RISCVMakeCompressible handle Zca/Zcf/Zce/Zcd. (#81844) Make targets which don't have C but have Zca/Zcf/Zce/Zcd benefit from this pass. --- .../Target/RISCV/RISCVMakeCompressible.cpp | 31 +- llvm/lib/Target/RISCV/RISCVSubtarget.h | 4 + llvm/test/CodeGen/RISCV/make-compressible.mir | 499 +++++++++++++----- 3 files changed, 400 insertions(+), 134 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index ff21fe1d40646..af864ba0fbc46 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -143,19 +143,35 @@ static bool isCompressedReg(Register Reg) { // Return true if MI is a load for which there exists a compressed version. 
static bool isCompressibleLoad(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); - const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) || - Opcode == RISCV::LD || Opcode == RISCV::FLD; + switch (MI.getOpcode()) { + default: + return false; + case RISCV::LW: + case RISCV::LD: + return STI.hasStdExtCOrZca(); + case RISCV::FLW: + return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); + case RISCV::FLD: + return STI.hasStdExtCOrZcd(); + } } // Return true if MI is a store for which there exists a compressed version. static bool isCompressibleStore(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); - const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) || - Opcode == RISCV::SD || Opcode == RISCV::FSD; + switch (MI.getOpcode()) { + default: + return false; + case RISCV::SW: + case RISCV::SD: + return STI.hasStdExtCOrZca(); + case RISCV::FSW: + return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); + case RISCV::FSD: + return STI.hasStdExtCOrZcd(); + } } // Find a single register and/or large offset which, if compressible, would @@ -324,8 +340,7 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) { const RISCVInstrInfo &TII = *STI.getInstrInfo(); // This optimization only makes sense if compressed instructions are emitted. - // FIXME: Support Zca, Zcf, Zcd granularity. 
- if (!STI.hasStdExtC()) + if (!STI.hasStdExtCOrZca()) return false; for (MachineBasicBlock &MBB : Fn) { diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b60d7aff22a0..9ebf278d6749f 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -143,6 +143,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { #include "RISCVGenSubtargetInfo.inc" bool hasStdExtCOrZca() const { return HasStdExtC || HasStdExtZca; } + bool hasStdExtCOrZcd() const { return HasStdExtC || HasStdExtZcd; } + bool hasStdExtCOrZcfOrZce() const { + return HasStdExtC || HasStdExtZcf || HasStdExtZce; + } bool hasStdExtZvl() const { return ZvlLen != 0; } bool hasStdExtFOrZfinx() const { return HasStdExtF || HasStdExtZfinx; } bool hasStdExtDOrZdinx() const { return HasStdExtD || HasStdExtZdinx; } diff --git a/llvm/test/CodeGen/RISCV/make-compressible.mir b/llvm/test/CodeGen/RISCV/make-compressible.mir index 2105a13bc8c7b..03da38a6863e7 100644 --- a/llvm/test/CodeGen/RISCV/make-compressible.mir +++ b/llvm/test/CodeGen/RISCV/make-compressible.mir @@ -1,8 +1,14 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -o - %s -mtriple=riscv32 -mattr=+c,+f,+d -simplify-mir \ -# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefix=RV32 %s +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32C %s # RUN: llc -o - %s -mtriple=riscv64 -mattr=+c,+f,+d -simplify-mir \ -# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefix=RV64 %s +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV64,RV64C %s +# RUN: llc -o - %s -mtriple=riscv32 -mattr=+d,+zcf -simplify-mir \ +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32ZCF %s +# RUN: llc -o - %s -mtriple=riscv32 -mattr=+d,+zca -simplify-mir \ +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32ZCA %s +# RUN: llc 
-o - %s -mtriple=riscv64 -mattr=+d,+zca -simplify-mir \ +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV64,RV64ZCA %s --- | define void @store_common_value(ptr %a, ptr %b, ptr %c) #0 { @@ -288,7 +294,7 @@ ret { double, double } %3 } - attributes #0 = { minsize "target-features"="+c,+f,+d" } + attributes #0 = { minsize } ... --- @@ -306,6 +312,7 @@ body: | ; RV32-NEXT: SW $x13, killed renamable $x11, 0 :: (store (s32) into %ir.b) ; RV32-NEXT: SW $x13, killed renamable $x12, 0 :: (store (s32) into %ir.c) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value ; RV64: liveins: $x10, $x11, $x12 ; RV64-NEXT: {{ $}} @@ -327,14 +334,15 @@ body: | bb.0.entry: liveins: $x10, $x11, $x12, $f16_f - ; RV32-LABEL: name: store_common_value_float - ; RV32: liveins: $x10, $x11, $x12, $f16_f - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f - ; RV32-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) - ; RV32-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) - ; RV32-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) - ; RV32-NEXT: PseudoRET + ; RV32C-LABEL: name: store_common_value_float + ; RV32C: liveins: $x10, $x11, $x12, $f16_f + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f + ; RV32C-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) + ; RV32C-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) + ; RV32C-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) + ; RV32C-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value_float ; RV64: liveins: $x10, $x11, $x12, $f16_f ; RV64-NEXT: {{ $}} @@ -342,6 +350,23 @@ body: | ; RV64-NEXT: FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) ; RV64-NEXT: FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) ; RV64-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: 
store_common_value_float + ; RV32ZCF: liveins: $x10, $x11, $x12, $f16_f + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f + ; RV32ZCF-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) + ; RV32ZCF-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) + ; RV32ZCF-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_common_value_float + ; RV32ZCA: liveins: $x10, $x11, $x12, $f16_f + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSW renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) + ; RV32ZCA-NEXT: FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) + ; RV32ZCA-NEXT: FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) + ; RV32ZCA-NEXT: PseudoRET FSW renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) @@ -355,22 +380,47 @@ body: | bb.0.entry: liveins: $x10, $x11, $x12, $f16_d - ; RV32-LABEL: name: store_common_value_double - ; RV32: liveins: $x10, $x11, $x12, $f16_d - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d - ; RV32-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) - ; RV32-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) - ; RV32-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) - ; RV32-NEXT: PseudoRET - ; RV64-LABEL: name: store_common_value_double - ; RV64: liveins: $x10, $x11, $x12, $f16_d - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d - ; RV64-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) - ; RV64-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) - ; RV64-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: 
(store (s64) into %ir.c) - ; RV64-NEXT: PseudoRET + ; RV32C-LABEL: name: store_common_value_double + ; RV32C: liveins: $x10, $x11, $x12, $f16_d + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d + ; RV32C-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV32C-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV32C-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV32C-NEXT: PseudoRET + ; + ; RV64C-LABEL: name: store_common_value_double + ; RV64C: liveins: $x10, $x11, $x12, $f16_d + ; RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d + ; RV64C-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV64C-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV64C-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV64C-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_common_value_double + ; RV32ZCF: liveins: $x10, $x11, $x12, $f16_d + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV32ZCF-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV32ZCF-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_common_value_double + ; RV32ZCA: liveins: $x10, $x11, $x12, $f16_d + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV32ZCA-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV32ZCA-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV32ZCA-NEXT: PseudoRET + ; + ; RV64ZCA-LABEL: name: store_common_value_double + ; RV64ZCA: liveins: $x10, $x11, $x12, $f16_d + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: FSD renamable $f16_d, killed 
renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV64ZCA-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV64ZCA-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV64ZCA-NEXT: PseudoRET FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) @@ -395,6 +445,7 @@ body: | ; RV32-NEXT: renamable $x10 = ADDI $x0, 5 ; RV32-NEXT: SW killed renamable $x10, killed $x11, 0 :: (volatile store (s32) into %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -432,6 +483,7 @@ body: | ; RV32-NEXT: SW killed renamable $x10, $x11, 0 :: (volatile store (s32) into %ir.p) ; RV32-NEXT: SW killed $x11, $x11, 0 :: (volatile store (s32) into %ir.q) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_self ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -457,14 +509,15 @@ body: | bb.0.entry: liveins: $x16, $f10_f, $f11_f, $f12_f - ; RV32-LABEL: name: store_common_ptr_float - ; RV32: liveins: $x16, $f10_f, $f11_f, $f12_f - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x10 = ADDI $x16, 0 - ; RV32-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p) - ; RV32-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p) - ; RV32-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p) - ; RV32-NEXT: PseudoRET + ; RV32C-LABEL: name: store_common_ptr_float + ; RV32C: liveins: $x16, $f10_f, $f11_f, $f12_f + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x10 = ADDI $x16, 0 + ; RV32C-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32C-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32C-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile 
store (s32) into %ir.p) + ; RV32C-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_float ; RV64: liveins: $x16, $f10_f, $f11_f, $f12_f ; RV64-NEXT: {{ $}} @@ -472,6 +525,23 @@ body: | ; RV64-NEXT: FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) ; RV64-NEXT: FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) ; RV64-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_common_ptr_float + ; RV32ZCF: liveins: $x16, $f10_f, $f11_f, $f12_f + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $x10 = ADDI $x16, 0 + ; RV32ZCF-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCF-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCF-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_common_ptr_float + ; RV32ZCA: liveins: $x16, $f10_f, $f11_f, $f12_f + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSW killed renamable $f10_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCA-NEXT: FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCA-NEXT: FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCA-NEXT: PseudoRET FSW killed renamable $f10_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) @@ -485,22 +555,47 @@ body: | bb.0.entry: liveins: $x16, $f10_d, $f11_d, $f12_d - ; RV32-LABEL: name: store_common_ptr_double - ; RV32: liveins: $x16, $f10_d, $f11_d, $f12_d - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x10 = ADDI $x16, 0 - ; RV32-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV32-NEXT: FSD killed renamable $f11_d, $x10, 0 :: 
(volatile store (s64) into %ir.p) - ; RV32-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV32-NEXT: PseudoRET - ; RV64-LABEL: name: store_common_ptr_double - ; RV64: liveins: $x16, $f10_d, $f11_d, $f12_d - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $x10 = ADDI $x16, 0 - ; RV64-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV64-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV64-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV64-NEXT: PseudoRET + ; RV32C-LABEL: name: store_common_ptr_double + ; RV32C: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x10 = ADDI $x16, 0 + ; RV32C-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV32C-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV32C-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV32C-NEXT: PseudoRET + ; + ; RV64C-LABEL: name: store_common_ptr_double + ; RV64C: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $x10 = ADDI $x16, 0 + ; RV64C-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV64C-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV64C-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV64C-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_common_ptr_double + ; RV32ZCF: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCF-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCF-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCF-NEXT: PseudoRET + ; 
+ ; RV32ZCA-LABEL: name: store_common_ptr_double + ; RV32ZCA: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCA-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCA-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCA-NEXT: PseudoRET + ; + ; RV64ZCA-LABEL: name: store_common_ptr_double + ; RV64ZCA: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV64ZCA-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV64ZCA-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV64ZCA-NEXT: PseudoRET FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) @@ -522,6 +617,7 @@ body: | ; RV32-NEXT: dead renamable $x10 = LW $x11, 0 :: (volatile load (s32) from %ir.p) ; RV32-NEXT: dead renamable $x10 = LW killed $x11, 0 :: (volatile load (s32) from %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: load_common_ptr ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -543,14 +639,15 @@ body: | bb.0.entry: liveins: $x16 - ; RV32-LABEL: name: load_common_ptr_float - ; RV32: liveins: $x16 - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x10 = ADDI $x16, 0 - ; RV32-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g) - ; RV32-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1) - ; RV32-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2) - ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, 
implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; RV32C-LABEL: name: load_common_ptr_float + ; RV32C: liveins: $x16 + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x10 = ADDI $x16, 0 + ; RV32C-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g) + ; RV32C-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1) + ; RV32C-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2) + ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; ; RV64-LABEL: name: load_common_ptr_float ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -558,6 +655,23 @@ body: | ; RV64-NEXT: renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1) ; RV64-NEXT: renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2) ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; + ; RV32ZCF-LABEL: name: load_common_ptr_float + ; RV32ZCF: liveins: $x16 + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $x10 = ADDI $x16, 0 + ; RV32ZCF-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g) + ; RV32ZCF-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1) + ; RV32ZCF-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2) + ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; + ; RV32ZCA-LABEL: name: load_common_ptr_float + ; RV32ZCA: liveins: $x16 + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: renamable $f10_f = FLW renamable $x16, 0 :: (load (s32) from %ir.g) + ; RV32ZCA-NEXT: renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1) + ; RV32ZCA-NEXT: renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2) + ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) 
@load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f renamable $f10_f = FLW renamable $x16, 0 :: (load (s32) from %ir.g) renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1) renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2) @@ -571,22 +685,47 @@ body: | bb.0.entry: liveins: $x16 - ; RV32-LABEL: name: load_common_ptr_double - ; RV32: liveins: $x16 - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x10 = ADDI $x16, 0 - ; RV32-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g) - ; RV32-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1) - ; RV32-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2) - ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d - ; RV64-LABEL: name: load_common_ptr_double - ; RV64: liveins: $x16 - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $x10 = ADDI $x16, 0 - ; RV64-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g) - ; RV64-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1) - ; RV64-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2) - ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; RV32C-LABEL: name: load_common_ptr_double + ; RV32C: liveins: $x16 + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x10 = ADDI $x16, 0 + ; RV32C-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g) + ; RV32C-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1) + ; RV32C-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2) + ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV64C-LABEL: name: load_common_ptr_double + ; RV64C: liveins: $x16 + ; 
RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $x10 = ADDI $x16, 0 + ; RV64C-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g) + ; RV64C-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1) + ; RV64C-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2) + ; RV64C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV32ZCF-LABEL: name: load_common_ptr_double + ; RV32ZCF: liveins: $x16 + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g) + ; RV32ZCF-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) + ; RV32ZCF-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2) + ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV32ZCA-LABEL: name: load_common_ptr_double + ; RV32ZCA: liveins: $x16 + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g) + ; RV32ZCA-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) + ; RV32ZCA-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2) + ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV64ZCA-LABEL: name: load_common_ptr_double + ; RV64ZCA: liveins: $x16 + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g) + ; RV64ZCA-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) + ; RV64ZCA-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2) + ; RV64ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit 
$f10_d, implicit $f11_d, implicit $f12_d renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g) renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2) @@ -613,6 +752,7 @@ body: | ; RV32-NEXT: renamable $x11 = ADDI $x0, 7 ; RV32-NEXT: SW killed renamable $x11, killed $x12, 28 :: (volatile store (s32) into %ir.3) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -644,15 +784,16 @@ body: | bb.0.entry: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f - ; RV32-LABEL: name: store_large_offset_float - ; RV32: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x11 = ADDI $x10, 384 - ; RV32-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0) - ; RV32-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1) - ; RV32-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2) - ; RV32-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3) - ; RV32-NEXT: PseudoRET + ; RV32C-LABEL: name: store_large_offset_float + ; RV32C: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x11 = ADDI $x10, 384 + ; RV32C-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0) + ; RV32C-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1) + ; RV32C-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2) + ; RV32C-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3) + ; RV32C-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset_float ; RV64: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f ; RV64-NEXT: {{ $}} @@ -661,6 +802,25 @@ body: | ; RV64-NEXT: FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2) ; 
RV64-NEXT: FSW killed renamable $f13_f, killed renamable $x10, 412 :: (volatile store (s32) into %ir.3) ; RV64-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_large_offset_float + ; RV32ZCF: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $x11 = ADDI $x10, 384 + ; RV32ZCF-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0) + ; RV32ZCF-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1) + ; RV32ZCF-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2) + ; RV32ZCF-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_large_offset_float + ; RV32ZCA: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0) + ; RV32ZCA-NEXT: FSW killed renamable $f11_f, renamable $x10, 404 :: (volatile store (s32) into %ir.1) + ; RV32ZCA-NEXT: FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2) + ; RV32ZCA-NEXT: FSW killed renamable $f13_f, killed renamable $x10, 412 :: (volatile store (s32) into %ir.3) + ; RV32ZCA-NEXT: PseudoRET FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0) FSW killed renamable $f11_f, renamable $x10, 404 :: (volatile store (s32) into %ir.1) FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2) @@ -675,24 +835,52 @@ body: | bb.0.entry: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d - ; RV32-LABEL: name: store_large_offset_double - ; RV32: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x11 = ADDI $x10, 768 - ; RV32-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0) - ; RV32-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1) - ; 
RV32-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2) - ; RV32-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3) - ; RV32-NEXT: PseudoRET - ; RV64-LABEL: name: store_large_offset_double - ; RV64: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $x11 = ADDI $x10, 768 - ; RV64-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0) - ; RV64-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1) - ; RV64-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2) - ; RV64-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3) - ; RV64-NEXT: PseudoRET + ; RV32C-LABEL: name: store_large_offset_double + ; RV32C: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x11 = ADDI $x10, 768 + ; RV32C-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0) + ; RV32C-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1) + ; RV32C-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2) + ; RV32C-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3) + ; RV32C-NEXT: PseudoRET + ; + ; RV64C-LABEL: name: store_large_offset_double + ; RV64C: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d + ; RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $x11 = ADDI $x10, 768 + ; RV64C-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0) + ; RV64C-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1) + ; RV64C-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2) + ; RV64C-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3) + ; RV64C-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_large_offset_double + ; RV32ZCF: liveins: $x10, $f10_d, $f11_d, 
$f12_d, $f13_d + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + ; RV32ZCF-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1) + ; RV32ZCF-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2) + ; RV32ZCF-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_large_offset_double + ; RV32ZCA: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + ; RV32ZCA-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1) + ; RV32ZCA-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2) + ; RV32ZCA-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3) + ; RV32ZCA-NEXT: PseudoRET + ; + ; RV64ZCA-LABEL: name: store_large_offset_double + ; RV64ZCA: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + ; RV64ZCA-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1) + ; RV64ZCA-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2) + ; RV64ZCA-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3) + ; RV64ZCA-NEXT: PseudoRET FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1) FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2) @@ -716,6 +904,7 @@ body: | ; RV32-NEXT: dead renamable $x11 = LW 
$x12, 24 :: (volatile load (s32) from %ir.2) ; RV32-NEXT: dead renamable $x10 = LW killed $x12, 28 :: (volatile load (s32) from %ir.3) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: load_large_offset ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -739,14 +928,15 @@ body: | bb.0.entry: liveins: $x10 - ; RV32-LABEL: name: load_large_offset_float - ; RV32: liveins: $x10 - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x11 = ADDI $x10, 384 - ; RV32-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx) - ; RV32-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1) - ; RV32-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2) - ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; RV32C-LABEL: name: load_large_offset_float + ; RV32C: liveins: $x10 + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x11 = ADDI $x10, 384 + ; RV32C-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx) + ; RV32C-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1) + ; RV32C-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2) + ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; ; RV64-LABEL: name: load_large_offset_float ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -754,6 +944,23 @@ body: | ; RV64-NEXT: renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1) ; RV64-NEXT: renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2) ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; + ; RV32ZCF-LABEL: name: load_large_offset_float + ; RV32ZCF: liveins: $x10 + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $x11 = ADDI $x10, 384 + ; RV32ZCF-NEXT: renamable $f10_f = FLW $x11, 16 
:: (load (s32) from %ir.arrayidx) + ; RV32ZCF-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1) + ; RV32ZCF-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2) + ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; + ; RV32ZCA-LABEL: name: load_large_offset_float + ; RV32ZCA: liveins: $x10 + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx) + ; RV32ZCA-NEXT: renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1) + ; RV32ZCA-NEXT: renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2) + ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx) renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1) renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2) @@ -767,22 +974,47 @@ body: | bb.0.entry: liveins: $x10 - ; RV32-LABEL: name: load_large_offset_double - ; RV32: liveins: $x10 - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x11 = ADDI $x10, 768 - ; RV32-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx) - ; RV32-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1) - ; RV32-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2) - ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d - ; RV64-LABEL: name: load_large_offset_double - ; RV64: liveins: $x10 - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $x11 = ADDI $x10, 768 - ; RV64-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx) - ; RV64-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) 
from %ir.arrayidx1) - ; RV64-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2) - ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; RV32C-LABEL: name: load_large_offset_double + ; RV32C: liveins: $x10 + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x11 = ADDI $x10, 768 + ; RV32C-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx) + ; RV32C-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1) + ; RV32C-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2) + ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV64C-LABEL: name: load_large_offset_double + ; RV64C: liveins: $x10 + ; RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $x11 = ADDI $x10, 768 + ; RV64C-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx) + ; RV64C-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1) + ; RV64C-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2) + ; RV64C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV32ZCF-LABEL: name: load_large_offset_double + ; RV32ZCF: liveins: $x10 + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) + ; RV32ZCF-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) + ; RV32ZCF-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2) + ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV32ZCA-LABEL: name: load_large_offset_double + ; RV32ZCA: liveins: $x10 + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: 
renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) + ; RV32ZCA-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) + ; RV32ZCA-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2) + ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV64ZCA-LABEL: name: load_large_offset_double + ; RV64ZCA: liveins: $x10 + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) + ; RV64ZCA-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) + ; RV64ZCA-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2) + ; RV64ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2) @@ -801,6 +1033,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: SW $x0, killed renamable $x10, 0 :: (store (s32) into %ir.a) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -822,6 +1055,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: FSW killed renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value_float_no_opt ; RV64: liveins: $x10, $f16_f ; RV64-NEXT: {{ $}} @@ -843,6 +1077,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: FSD killed renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value_double_no_opt ; RV64: liveins: $x10, $f16_d ; RV64-NEXT: {{ $}} @@ 
-865,6 +1100,7 @@ body: | ; RV32-NEXT: renamable $x10 = ADDI $x0, 1 ; RV32-NEXT: SW killed renamable $x10, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_no_opt ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -888,6 +1124,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: FSW killed renamable $f10_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_float_no_opt ; RV64: liveins: $x16, $f10_f ; RV64-NEXT: {{ $}} @@ -909,6 +1146,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: FSD killed renamable $f10_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_double_no_opt ; RV64: liveins: $x16, $f10_d ; RV64-NEXT: {{ $}} @@ -930,6 +1168,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: dead renamable $x10 = LW killed renamable $x16, 0 :: (volatile load (s32) from %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: load_common_ptr_no_opt ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -951,6 +1190,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: renamable $f10_f = FLW killed renamable $x16, 0 :: (load (s32) from %ir.g) ; RV32-NEXT: PseudoRET implicit $f10_f + ; ; RV64-LABEL: name: load_common_ptr_float_no_opt ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -972,6 +1212,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: renamable $f10_d = FLD killed renamable $x16, 0 :: (load (s64) from %ir.g) ; RV32-NEXT: PseudoRET implicit $f10_d + ; ; RV64-LABEL: name: load_common_ptr_double_no_opt ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -996,6 +1237,7 @@ body: | ; RV32-NEXT: renamable $x11 = ADDI $x0, 3 ; RV32-NEXT: SW killed renamable $x11, killed renamable $x10, 404 :: (volatile store (s32) into %ir.1) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -1024,6 +1266,7 @@ body: | ; RV32-NEXT: FSW killed renamable 
$f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0) ; RV32-NEXT: FSW killed renamable $f11_f, killed renamable $x10, 404 :: (volatile store (s32) into %ir.1) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset_float_no_opt ; RV64: liveins: $x10, $f10_f, $f11_f ; RV64-NEXT: {{ $}} @@ -1048,6 +1291,7 @@ body: | ; RV32-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) ; RV32-NEXT: FSD killed renamable $f11_d, killed renamable $x10, 808 :: (volatile store (s64) into %ir.1) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset_double_no_opt ; RV64: liveins: $x10, $f10_d, $f11_d ; RV64-NEXT: {{ $}} @@ -1072,6 +1316,7 @@ body: | ; RV32-NEXT: dead renamable $x11 = LW renamable $x10, 400 :: (volatile load (s32) from %ir.0) ; RV32-NEXT: dead renamable $x10 = LW killed renamable $x10, 404 :: (volatile load (s32) from %ir.1) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: load_large_offset_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -1096,6 +1341,7 @@ body: | ; RV32-NEXT: renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx) ; RV32-NEXT: renamable $f11_f = FLW killed renamable $x10, 404 :: (load (s32) from %ir.arrayidx1) ; RV32-NEXT: PseudoRET implicit $f10_f, implicit $f11_f + ; ; RV64-LABEL: name: load_large_offset_float_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -1120,6 +1366,7 @@ body: | ; RV32-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) ; RV32-NEXT: renamable $f11_d = FLD killed renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) ; RV32-NEXT: PseudoRET implicit $f10_d, implicit $f11_d + ; ; RV64-LABEL: name: load_large_offset_double_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} From edd4aee4dd9b5b98b2576a6f783e4086173d902a Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 22 Feb 2024 15:57:57 +0800 Subject: [PATCH 188/351] [RISCV] Compute integers once in isSimpleVIDSequence. 
NFCI (#82590) We need to iterate through the integers twice in isSimpleVIDSequence, so instead of computing them twice just compute them once at the start. This also replaces the individual checks that each element is constant with a single call to BuildVectorSDNode::isConstant. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 64 ++++++++++----------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 75be97ff32bbe..cf0dc36a51b61 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3242,44 +3242,47 @@ static std::optional getExactInteger(const APFloat &APF, // determine whether this is worth generating code for. static std::optional isSimpleVIDSequence(SDValue Op, unsigned EltSizeInBits) { - unsigned NumElts = Op.getNumOperands(); assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR"); + if (!cast(Op)->isConstant()) + return std::nullopt; bool IsInteger = Op.getValueType().isInteger(); std::optional SeqStepDenom; std::optional SeqStepNum, SeqAddend; std::optional> PrevElt; assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits()); - for (unsigned Idx = 0; Idx < NumElts; Idx++) { - // Assume undef elements match the sequence; we just have to be careful - // when interpolating across them. - if (Op.getOperand(Idx).isUndef()) - continue; - uint64_t Val; + // First extract the ops into a list of constant integer values. This may not + // be possible for floats if they're not all representable as integers. + SmallVector> Elts(Op.getNumOperands()); + const unsigned OpSize = Op.getScalarValueSizeInBits(); + for (auto [Idx, Elt] : enumerate(Op->op_values())) { + if (Elt.isUndef()) { + Elts[Idx] = std::nullopt; + continue; + } if (IsInteger) { - // The BUILD_VECTOR must be all constants. 
- if (!isa(Op.getOperand(Idx))) - return std::nullopt; - Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(Op.getScalarValueSizeInBits()); + Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes(OpSize); } else { - // The BUILD_VECTOR must be all constants. - if (!isa(Op.getOperand(Idx))) - return std::nullopt; - if (auto ExactInteger = getExactInteger( - cast(Op.getOperand(Idx))->getValueAPF(), - Op.getScalarValueSizeInBits())) - Val = *ExactInteger; - else + auto ExactInteger = + getExactInteger(cast(Elt)->getValueAPF(), OpSize); + if (!ExactInteger) return std::nullopt; + Elts[Idx] = *ExactInteger; } + } + + for (auto [Idx, Elt] : enumerate(Elts)) { + // Assume undef elements match the sequence; we just have to be careful + // when interpolating across them. + if (!Elt) + continue; if (PrevElt) { // Calculate the step since the last non-undef element, and ensure // it's consistent across the entire sequence. unsigned IdxDiff = Idx - PrevElt->second; - int64_t ValDiff = SignExtend64(Val - PrevElt->first, EltSizeInBits); + int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits); // A zero-value value difference means that we're somewhere in the middle // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a @@ -3309,8 +3312,8 @@ static std::optional isSimpleVIDSequence(SDValue Op, } // Record this non-undef element for later. - if (!PrevElt || PrevElt->first != Val) - PrevElt = std::make_pair(Val, Idx); + if (!PrevElt || PrevElt->first != *Elt) + PrevElt = std::make_pair(*Elt, Idx); } // We need to have logged a step for this to count as a legal index sequence. @@ -3319,21 +3322,12 @@ static std::optional isSimpleVIDSequence(SDValue Op, // Loop back through the sequence and validate elements we might have skipped // while waiting for a valid step. While doing this, log any sequence addend. 
- for (unsigned Idx = 0; Idx < NumElts; Idx++) { - if (Op.getOperand(Idx).isUndef()) + for (auto [Idx, Elt] : enumerate(Elts)) { + if (!Elt) continue; - uint64_t Val; - if (IsInteger) { - Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(Op.getScalarValueSizeInBits()); - } else { - Val = *getExactInteger( - cast(Op.getOperand(Idx))->getValueAPF(), - Op.getScalarValueSizeInBits()); - } uint64_t ExpectedVal = (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom; - int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits); + int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits); if (!SeqAddend) SeqAddend = Addend; else if (Addend != SeqAddend) From e899641df2391179e8ec29ca14c53b09ae7ce85c Mon Sep 17 00:00:00 2001 From: martinboehme Date: Thu, 22 Feb 2024 09:00:20 +0100 Subject: [PATCH 189/351] [clang][dataflow] Fix inaccuracies in `buildStmtToBasicBlockMap()`. (#82496) See the comments added to the code for details on the inaccuracies that have now been fixed. The patch adds tests that fail with the old implementation. --- .../FlowSensitive/ControlFlowContext.cpp | 31 +++- .../TypeErasedDataflowAnalysisTest.cpp | 143 ++++++++++++++---- 2 files changed, 140 insertions(+), 34 deletions(-) diff --git a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp index c9ebffe6f3780..8aed19544be6a 100644 --- a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp @@ -39,8 +39,35 @@ buildStmtToBasicBlockMap(const CFG &Cfg) { StmtToBlock[Stmt->getStmt()] = Block; } - if (const Stmt *TerminatorStmt = Block->getTerminatorStmt()) - StmtToBlock[TerminatorStmt] = Block; + } + // Some terminator conditions don't appear as a `CFGElement` anywhere else - + // for example, this is true if the terminator condition is a `&&` or `||` + // operator. 
+ // We associate these conditions with the block the terminator appears in, + // but only if the condition has not already appeared as a regular + // `CFGElement`. (The `insert()` below does nothing if the key already exists + // in the map.) + for (const CFGBlock *Block : Cfg) { + if (Block != nullptr) + if (const Stmt *TerminatorCond = Block->getTerminatorCondition()) + StmtToBlock.insert({TerminatorCond, Block}); + } + // Terminator statements typically don't appear as a `CFGElement` anywhere + // else, so we want to associate them with the block that they terminate. + // However, there are some important special cases: + // - The conditional operator is a type of terminator, but it also appears + // as a regular `CFGElement`, and we want to associate it with the block + // in which it appears as a `CFGElement`. + // - The `&&` and `||` operators are types of terminators, but like the + // conditional operator, they can appear as a regular `CFGElement` or + // as a terminator condition (see above). + // We process terminators last to make sure that we only associate them with + // the block they terminate if they haven't previously occurred as a regular + // `CFGElement` or as a terminator condition. + for (const CFGBlock *Block : Cfg) { + if (Block != nullptr) + if (const Stmt *TerminatorStmt = Block->getTerminatorStmt()) + StmtToBlock.insert({TerminatorStmt, Block}); } return StmtToBlock; } diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp index 3bca9cced8d6f..34f9b0b23719f 100644 --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -77,17 +77,33 @@ class DataflowAnalysisTest : public Test { return runDataflowAnalysis(*CFCtx, Analysis, Env); } + /// Returns the `CFGBlock` containing `S` (and asserts that it exists). 
+ const CFGBlock *blockForStmt(const Stmt &S) { + const CFGBlock *Block = CFCtx->getStmtToBlock().lookup(&S); + assert(Block != nullptr); + return Block; + } + template const StateT & blockStateForStmt(const std::vector> &BlockStates, - const Stmt *S) { - const CFGBlock *Block = CFCtx->getStmtToBlock().lookup(S); - assert(Block != nullptr); - const std::optional &MaybeState = BlockStates[Block->getBlockID()]; + const Stmt &S) { + const std::optional &MaybeState = + BlockStates[blockForStmt(S)->getBlockID()]; assert(MaybeState.has_value()); return *MaybeState; } + /// Returns the first node that matches `Matcher` (and asserts that the match + /// was successful, i.e. the returned node is not null). + template + const NodeT &matchNode(MatcherT Matcher) { + const auto *Node = selectFirst( + "node", match(Matcher.bind("node"), AST->getASTContext())); + assert(Node != nullptr); + return *Node; + } + std::unique_ptr AST; std::unique_ptr CFCtx; std::unique_ptr DACtx; @@ -130,6 +146,79 @@ TEST_F(DataflowAnalysisTest, DiagnoseFunctionDiagnoserCalledOnEachElement) { " (Lifetime ends)\n"))); } +// Tests for the statement-to-block map. +using StmtToBlockTest = DataflowAnalysisTest; + +TEST_F(StmtToBlockTest, ConditionalOperator) { + std::string Code = R"( + void target(bool b) { + int i = b ? 1 : 0; + } + )"; + ASSERT_THAT_ERROR(runAnalysis( + Code, [](ASTContext &C) { return NoopAnalysis(C); }) + .takeError(), + llvm::Succeeded()); + + const auto &IDecl = matchNode(declStmt(has(varDecl(hasName("i"))))); + const auto &ConditionalOp = + matchNode(conditionalOperator()); + + // The conditional operator should be associated with the same block as the + // `DeclStmt` for `i`. (Specifically, the conditional operator should not be + // associated with the block for which it is the terminator.) 
+ EXPECT_EQ(blockForStmt(IDecl), blockForStmt(ConditionalOp)); +} + +TEST_F(StmtToBlockTest, LogicalAnd) { + std::string Code = R"( + void target(bool b1, bool b2) { + bool b = b1 && b2; + } + )"; + ASSERT_THAT_ERROR(runAnalysis( + Code, [](ASTContext &C) { return NoopAnalysis(C); }) + .takeError(), + llvm::Succeeded()); + + const auto &BDecl = matchNode(declStmt(has(varDecl(hasName("b"))))); + const auto &AndOp = + matchNode(binaryOperator(hasOperatorName("&&"))); + + // The `&&` operator should be associated with the same block as the + // `DeclStmt` for `b`. (Specifically, the `&&` operator should not be + // associated with the block for which it is the terminator.) + EXPECT_EQ(blockForStmt(BDecl), blockForStmt(AndOp)); +} + +TEST_F(StmtToBlockTest, IfStatementWithLogicalAnd) { + std::string Code = R"( + void target(bool b1, bool b2) { + if (b1 && b2) + ; + } + )"; + ASSERT_THAT_ERROR(runAnalysis( + Code, [](ASTContext &C) { return NoopAnalysis(C); }) + .takeError(), + llvm::Succeeded()); + + const auto &If = matchNode(ifStmt()); + const auto &B2 = + matchNode(declRefExpr(to(varDecl(hasName("b2"))))); + const auto &AndOp = + matchNode(binaryOperator(hasOperatorName("&&"))); + + // The if statement is the terminator for the block that contains both `b2` + // and the `&&` operator (which appears only as a terminator condition, not + // as a regular `CFGElement`). + const CFGBlock *IfBlock = blockForStmt(If); + const CFGBlock *B2Block = blockForStmt(B2); + const CFGBlock *AndOpBlock = blockForStmt(AndOp); + EXPECT_EQ(IfBlock, B2Block); + EXPECT_EQ(IfBlock, AndOpBlock); +} + // Tests that check we discard state for expressions correctly. 
using DiscardExprStateTest = DataflowAnalysisTest; @@ -144,25 +233,20 @@ TEST_F(DiscardExprStateTest, WhileStatement) { auto BlockStates = llvm::cantFail(runAnalysis( Code, [](ASTContext &C) { return NoopAnalysis(C); })); - auto *NotEqOp = selectFirst( - "op", match(binaryOperator(hasOperatorName("!=")).bind("op"), - AST->getASTContext())); - ASSERT_NE(NotEqOp, nullptr); - - auto *CallFoo = selectFirst( - "call", match(callExpr(callee(functionDecl(hasName("foo")))).bind("call"), - AST->getASTContext())); - ASSERT_NE(CallFoo, nullptr); + const auto &NotEqOp = + matchNode(binaryOperator(hasOperatorName("!="))); + const auto &CallFoo = + matchNode(callExpr(callee(functionDecl(hasName("foo"))))); // In the block that evaluates the expression `p != nullptr`, this expression // is associated with a value. const auto &NotEqOpState = blockStateForStmt(BlockStates, NotEqOp); - EXPECT_NE(NotEqOpState.Env.getValue(*NotEqOp), nullptr); + EXPECT_NE(NotEqOpState.Env.getValue(NotEqOp), nullptr); // In the block that calls `foo(p)`, the value for `p != nullptr` is discarded // because it is not consumed by this block. 
const auto &CallFooState = blockStateForStmt(BlockStates, CallFoo); - EXPECT_EQ(CallFooState.Env.getValue(*NotEqOp), nullptr); + EXPECT_EQ(CallFooState.Env.getValue(NotEqOp), nullptr); } TEST_F(DiscardExprStateTest, BooleanOperator) { @@ -174,29 +258,24 @@ TEST_F(DiscardExprStateTest, BooleanOperator) { auto BlockStates = llvm::cantFail(runAnalysis( Code, [](ASTContext &C) { return NoopAnalysis(C); })); - auto *AndOp = selectFirst( - "op", match(binaryOperator(hasOperatorName("&&")).bind("op"), - AST->getASTContext())); - ASSERT_NE(AndOp, nullptr); - - auto *Return = selectFirst( - "return", match(returnStmt().bind("return"), AST->getASTContext())); - ASSERT_NE(Return, nullptr); + const auto &AndOp = + matchNode(binaryOperator(hasOperatorName("&&"))); + const auto &Return = matchNode(returnStmt()); // In the block that evaluates the LHS of the `&&` operator, the LHS is // associated with a value, while the right-hand side is not (unsurprisingly, // as it hasn't been evaluated yet). - const auto &LHSState = blockStateForStmt(BlockStates, AndOp->getLHS()); - auto *LHSValue = cast(LHSState.Env.getValue(*AndOp->getLHS())); + const auto &LHSState = blockStateForStmt(BlockStates, *AndOp.getLHS()); + auto *LHSValue = cast(LHSState.Env.getValue(*AndOp.getLHS())); ASSERT_NE(LHSValue, nullptr); - EXPECT_EQ(LHSState.Env.getValue(*AndOp->getRHS()), nullptr); + EXPECT_EQ(LHSState.Env.getValue(*AndOp.getRHS()), nullptr); // In the block that evaluates the RHS, the RHS is associated with a // value. The value for the LHS has been discarded as it is not consumed by // this block. 
- const auto &RHSState = blockStateForStmt(BlockStates, AndOp->getRHS()); - EXPECT_EQ(RHSState.Env.getValue(*AndOp->getLHS()), nullptr); - auto *RHSValue = cast(RHSState.Env.getValue(*AndOp->getRHS())); + const auto &RHSState = blockStateForStmt(BlockStates, *AndOp.getRHS()); + EXPECT_EQ(RHSState.Env.getValue(*AndOp.getLHS()), nullptr); + auto *RHSValue = cast(RHSState.Env.getValue(*AndOp.getRHS())); ASSERT_NE(RHSValue, nullptr); // In the block that evaluates the return statement, the expression `b1 && b2` @@ -217,9 +296,9 @@ TEST_F(DiscardExprStateTest, BooleanOperator) { // operands, rather than from the environment for the block that contains the // `&&`. const auto &ReturnState = blockStateForStmt(BlockStates, Return); - EXPECT_EQ(ReturnState.Env.getValue(*AndOp->getLHS()), nullptr); - EXPECT_EQ(ReturnState.Env.getValue(*AndOp->getRHS()), nullptr); - EXPECT_EQ(ReturnState.Env.getValue(*AndOp), + EXPECT_EQ(ReturnState.Env.getValue(*AndOp.getLHS()), nullptr); + EXPECT_EQ(ReturnState.Env.getValue(*AndOp.getRHS()), nullptr); + EXPECT_EQ(ReturnState.Env.getValue(AndOp), &ReturnState.Env.makeAnd(*LHSValue, *RHSValue)); } From 8bd327d6fed5a4ae99bdbd039f5503700030cf53 Mon Sep 17 00:00:00 2001 From: Nick Anderson Date: Thu, 22 Feb 2024 00:47:36 -0800 Subject: [PATCH 190/351] [AMDGPU][GlobalISel] Add fdiv / sqrt to rsq combine (#78673) Fixes #64743 --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 8 +- .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 23 + .../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 584 ++++++++++++++++++ 3 files changed, 614 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index b9411e2052120..9218760538dc5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -33,6 +33,12 @@ def rcp_sqrt_to_rsq : GICombineRule< [{ return matchRcpSqrtToRsq(*${rcp}, 
${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>; +def fdiv_by_sqrt_to_rsq_f16 : GICombineRule< + (defs root:$root), + (match (G_FSQRT f16:$sqrt, $x, (MIFlags FmContract)), + (G_FDIV f16:$dst, $y, $sqrt, (MIFlags FmContract)):$root, + [{ return matchFDivSqrtToRsqF16(*${root}); }]), + (apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>; def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">; @@ -156,7 +162,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< "AMDGPUPostLegalizerCombinerImpl", [all_combines, gfx6gfx7_combines, gfx8_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> { + rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index a1c34e92a57f3..82e17ddad851f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -83,6 +83,9 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner { matchRcpSqrtToRsq(MachineInstr &MI, std::function &MatchInfo) const; + bool matchFDivSqrtToRsqF16(MachineInstr &MI) const; + void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const; + // FIXME: Should be able to have 2 separate matchdatas rather than custom // struct boilerplate. 
struct CvtF32UByteMatchInfo { @@ -334,6 +337,26 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( return false; } +bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16( + MachineInstr &MI) const { + Register Sqrt = MI.getOperand(2).getReg(); + return MRI.hasOneNonDBGUse(Sqrt); +} + +void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16( + MachineInstr &MI, const Register &X) const { + Register Dst = MI.getOperand(0).getReg(); + Register Y = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + uint32_t Flags = MI.getFlags(); + Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy}) + .addUse(X) + .setMIFlags(Flags) + .getReg(0); + B.buildFMul(Dst, RSQ, Y, Flags); + MI.eraseFromParent(); +} + bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN( MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const { Register SrcReg = MI.getOperand(1).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir new file mode 100644 index 0000000000000..6c5339e36c77f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir @@ -0,0 +1,584 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: rsq_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + 
%one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: rsq_f16_missing_contract0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_missing_contract0 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: rsq_f16_missing_contract1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_missing_contract1 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... 
+ +--- +name: neg_rsq_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FNEG [[INT]] + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %neg_one:_(s16) = G_FCONSTANT half -1.0 + %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: neg_rsq_f16_missing_contract0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f16_missing_contract0 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00 + ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = G_FSQRT %x + %neg_one:_(s16) = G_FCONSTANT half -1.0 + %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... 
+ +--- +name: neg_rsq_f16_missing_contract1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f16_missing_contract1 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00 + ; GCN-NEXT: %rsq:_(s16) = G_FDIV %neg_one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %neg_one:_(s16) = G_FCONSTANT half -1.0 + %rsq:_(s16) = G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: rsq_f16_multi_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_multi_use + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + S_ENDPGM 0, implicit %sqrt + +... 
+ +--- +name: rsq_f16_multi_use_missing_contract0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_multi_use_missing_contract0 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + S_ENDPGM 0, implicit %sqrt + +... + +--- +name: rsq_f16_multi_use_missing_contract1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_multi_use_missing_contract1 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + S_ENDPGM 0, implicit %sqrt + +... 
+ +--- +name: rsq_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: $vgpr0 = COPY %rsq(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract G_FSQRT %x + %one:_(s32) = G_FCONSTANT float 1.0 + %rsq:_(s32) = contract G_FDIV %one, %sqrt + $vgpr0 = COPY %rsq + +... + +--- +name: neg_rsq_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt + ; GCN-NEXT: $vgpr0 = COPY %rsq(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract G_FSQRT %x + %neg_one:_(s32) = G_FCONSTANT float -1.0 + %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt + $vgpr0 = COPY %rsq + +... + +--- +name: afn_rsq_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_rsq_f32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x + ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt + ; GCN-NEXT: $vgpr0 = COPY %rsq(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract afn G_FSQRT %x + %one:_(s32) = G_FCONSTANT float 1.0 + %rsq:_(s32) = contract afn G_FDIV %one, %sqrt + $vgpr0 = COPY %rsq + +... 
+ +--- +name: afn_rsq_f32_multi_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_rsq_f32_multi_use + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x + ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt + ; GCN-NEXT: %ret:_(s32) = G_FSUB %sqrt, %rsq + ; GCN-NEXT: $vgpr0 = COPY %ret(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract afn G_FSQRT %x + %one:_(s32) = G_FCONSTANT float 1.0 + %rsq:_(s32) = contract afn G_FDIV %one, %sqrt + %ret:_(s32) = G_FSUB %sqrt, %rsq + $vgpr0 = COPY %ret + +... + +--- +name: afn_neg_rsq_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_neg_rsq_f32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt + ; GCN-NEXT: $vgpr0 = COPY %rsq(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract afn G_FSQRT %x + %neg_one:_(s32) = G_FCONSTANT float -1.0 + %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt + $vgpr0 = COPY %rsq + +... 
+ + +--- +name: rsq_f64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %sqrt:_(s64) = contract G_FSQRT %x + %one:_(s64) = G_FCONSTANT double 1.0 + %rsq:_(s64) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_TRUNC %rsq:_(s64) + $vgpr0 = COPY %ext + +... + +--- +name: neg_rsq_f64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00 + ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %sqrt:_(s64) = contract G_FSQRT %x + %neg_one:_(s64) = G_FCONSTANT double -1.0 + %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_TRUNC %rsq:_(s64) + $vgpr0 = COPY %ext + +... 
+ +--- +name: afn_rsq_f64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_rsq_f64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x + ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %sqrt:_(s64) = contract afn G_FSQRT %x + %one:_(s64) = G_FCONSTANT double 1.0 + %rsq:_(s64) = contract afn G_FDIV %one, %sqrt + %ext:_(s32) = G_TRUNC %rsq:_(s64) + $vgpr0 = COPY %ext + +... + +--- +name: afn_neg_rsq_f64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_neg_rsq_f64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00 + ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %sqrt:_(s64) = contract afn G_FSQRT %x + %neg_one:_(s64) = G_FCONSTANT double -1.0 + %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_TRUNC %rsq:_(s64) + $vgpr0 = COPY %ext + +... 
+ + +--- +name: rsq_fract_num_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_fract_num_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %fract:_(s16) = G_FCONSTANT half 0xH3800 + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %fract + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %fract:_(s16) = G_FCONSTANT half 0.5 + %rsq:_(s16) = contract G_FDIV %fract, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: neg_rsq_fract_num_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_fract_num_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %neg_fract:_(s16) = G_FCONSTANT half 0xHB800 + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_fract + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %neg_fract:_(s16) = G_FCONSTANT half -0.5 + %rsq:_(s16) = contract G_FDIV %neg_fract, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + + +... 
+ +--- +name: rsq_large_num_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_large_num_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %ten:_(s16) = G_FCONSTANT half 0xH4900 + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %ten + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %ten:_(s16) = G_FCONSTANT half 10.0 + %rsq:_(s16) = contract G_FDIV %ten, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: neg_rsq_large_num_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_large_num_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %neg_ten:_(s16) = G_FCONSTANT half 0xHC900 + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_ten + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %neg_ten:_(s16) = G_FCONSTANT half -10.0 + %rsq:_(s16) = contract G_FDIV %neg_ten, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... From fde344aef20bc4280f01294ac6e14a5c2db2d572 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 22 Feb 2024 09:55:50 +0100 Subject: [PATCH 191/351] [mlir][Transforms] Dialect conversion: Improve signature conversion API (#81997) This commit improves the block signature conversion API of the dialect conversion. 
There is the following comment in `ArgConverter::applySignatureConversion`: ``` // If no arguments are being changed or added, there is nothing to do. ``` However, the implementation actually used to replace a block with a new block even if the block argument types do not change (i.e., there is "nothing to do"). This is fixed in this commit. The documentation of the public `ConversionPatternRewriter` API is updated accordingly. This commit also removes a check that used to *sometimes* skip a block signature conversion if the block was already converted. This is not consistent with the public `ConversionPatternRewriter` API; blocks should always be converted, regardless of whether they were already converted or not. Block signature conversion also used to be silently skipped when the specified block was detached. Instead of silently skipping, an assertion is triggered. Attempting to convert a detached block (which is likely an erased block) is invalid API usage. --- mlir/include/mlir/Transforms/DialectConversion.h | 12 +++++++++--- mlir/lib/Transforms/Utils/DialectConversion.cpp | 10 +++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 0d7722aa07ee3..2575be4cdea1a 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -663,6 +663,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Apply a signature conversion to the entry block of the given region. This /// replaces the entry block with a new block containing the updated /// signature. The new entry block to the region is returned for convenience. + /// If no block argument types are changing, the entry original block will be + /// left in place and returned. /// /// If provided, `converter` will be used for any materializations. 
Block * @@ -671,8 +673,11 @@ class ConversionPatternRewriter final : public PatternRewriter { const TypeConverter *converter = nullptr); /// Convert the types of block arguments within the given region. This - /// replaces each block with a new block containing the updated signature. The - /// entry block may have a special conversion if `entryConversion` is + /// replaces each block with a new block containing the updated signature. If + /// an updated signature would match the current signature, the respective + /// block is left in place as is. + /// + /// The entry block may have a special conversion if `entryConversion` is /// provided. On success, the new entry block to the region is returned for /// convenience. Otherwise, failure is returned. FailureOr convertRegionTypes( @@ -681,7 +686,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Convert the types of block arguments within the given region except for /// the entry region. This replaces each non-entry block with a new block - /// containing the updated signature. + /// containing the updated signature. If an updated signature would match the + /// current signature, the respective block is left in place as is. /// /// If special conversion behavior is needed for the non-entry blocks (for /// example, we need to convert only a subset of a BB arguments), such diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 4989ddc3ec94f..afdd31a748c8c 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -544,12 +544,8 @@ FailureOr ArgConverter::convertSignature( Block *block, const TypeConverter *converter, ConversionValueMapping &mapping, SmallVectorImpl &argReplacements) { - // Check if the block was already converted. - // * If the block is mapped in `conversionInfo`, it is a converted block. 
- // * If the block is detached, conservatively assume that it is going to be - // deleted; it is likely the old block (before it was converted). - if (conversionInfo.count(block) || !block->getParent()) - return block; + assert(block->getParent() && "cannot convert signature of detached block"); + // If a converter wasn't provided, and the block wasn't already converted, // there is nothing we can do. if (!converter) @@ -570,7 +566,7 @@ Block *ArgConverter::applySignatureConversion( // If no arguments are being changed or added, there is nothing to do. unsigned origArgCount = block->getNumArguments(); auto convertedTypes = signatureConversion.getConvertedTypes(); - if (origArgCount == 0 && convertedTypes.empty()) + if (llvm::equal(block->getArgumentTypes(), convertedTypes)) return block; // Split the block at the beginning to get a new block to use for the updated From 25e7e8d993f12f391ad90d23b5c3e2385ebafc81 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Tue, 20 Feb 2024 22:13:46 +0100 Subject: [PATCH 192/351] [CGP] Permit tail call optimization on undefined return value We may freely allow tail call optzs on undef values as well. Fixes: https://github.com/llvm/llvm-project/issues/82387. --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 5 +- llvm/test/CodeGen/AArch64/addsub.ll | 6 +- .../CodeGen/AArch64/callbr-asm-obj-file.ll | 2 +- llvm/test/CodeGen/RISCV/pr51206.ll | 12 ++-- llvm/test/CodeGen/X86/tailcall-cgp-dup.ll | 58 ++++++++++++++++++- 5 files changed, 66 insertions(+), 17 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 4036f18dbc679..feefe87f40636 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2686,8 +2686,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, attributesPermitTailCall(F, CI, RetI, *TLI)) { // Either we return void or the return value must be the first // argument of a known intrinsic or library function. 
- if (!V || (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && - V == CI->getArgOperand(0))) { + if (!V || isa(V) || + (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && + V == CI->getArgOperand(0))) { TailCallBBs.push_back(Pred); } } diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll index 1b86fe6c707c8..20215fe914692 100644 --- a/llvm/test/CodeGen/AArch64/addsub.ll +++ b/llvm/test/CodeGen/AArch64/addsub.ll @@ -662,17 +662,13 @@ define dso_local i32 @_extract_crng_crng() { ; CHECK-NEXT: cmn x8, #1272 ; CHECK-NEXT: b.pl .LBB36_3 ; CHECK-NEXT: .LBB36_2: // %if.then -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, primary_crng ; CHECK-NEXT: ldr w8, [x8, :lo12:primary_crng] ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: adrp x8, input_pool ; CHECK-NEXT: add x8, x8, :lo12:input_pool ; CHECK-NEXT: csel x0, xzr, x8, eq -; CHECK-NEXT: bl crng_reseed -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: b crng_reseed ; CHECK-NEXT: .LBB36_3: // %if.end ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll index 94041bf00218c..e601f03d524a4 100644 --- a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll +++ b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll @@ -40,7 +40,7 @@ declare dso_local i32 @g(...) local_unnamed_addr declare dso_local i32 @i(...) 
local_unnamed_addr ; CHECK-LABEL: : -; CHECK: bl {{.*}} +; CHECK: b {{.*}} ; CHECK-LABEL: <$d.5>: ; CHECK-LABEL: <$x.6>: ; CHECK-NEXT: b {{.*}} diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll index f54031af0de5e..8aa145f6ac5ef 100644 --- a/llvm/test/CodeGen/RISCV/pr51206.ll +++ b/llvm/test/CodeGen/RISCV/pr51206.ll @@ -27,16 +27,12 @@ define signext i32 @wobble() nounwind { ; CHECK-NEXT: lui a2, %hi(global.3) ; CHECK-NEXT: li a3, 5 ; CHECK-NEXT: sw a1, %lo(global.3)(a2) -; CHECK-NEXT: bltu a0, a3, .LBB0_2 -; CHECK-NEXT: # %bb.1: # %bb10 -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: call quux -; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .LBB0_2: # %bb12 +; CHECK-NEXT: bgeu a0, a3, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %bb12 ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: # %bb10 +; CHECK-NEXT: tail quux bb: %tmp = load i8, ptr @global, align 1 %tmp1 = zext i8 %tmp to i32 diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll index 401ed9f7bc5a9..8a9ee60f341c2 100644 --- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll +++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll @@ -339,7 +339,7 @@ return: define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src) nounwind { ; CHECK-LABEL: strcpy_illegal_tailc: -; CHECK: ## %bb.0: +; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdx, %rbx ; CHECK-NEXT: testq %rsi, %rsi @@ -351,6 +351,7 @@ define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src) ; CHECK-NEXT: movq %rbx, %rax ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq +entry: %cmp = icmp eq i64 %sz, 0 br i1 %cmp, label %return, label %if.then @@ -362,8 +363,63 @@ return: ret ptr %src } +@i = global i32 0, align 4 + +define i32 @undef_tailc() nounwind { +; CHECK-LABEL: undef_tailc: +; CHECK: ## %bb.0: ## %entry +; 
CHECK-NEXT: cmpl $0, _i(%rip) +; CHECK-NEXT: jne _qux ## TAILCALL +; CHECK-NEXT: ## %bb.1: ## %return +; CHECK-NEXT: retq +entry: + %val = load i32, ptr @i, align 4 + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %return, label %if.then + +if.then: + %rv_unused = tail call i32 @qux() + br label %return + +return: + ret i32 undef +} + +define i32 @undef_and_known_tailc() nounwind { +; CHECK-LABEL: undef_and_known_tailc: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: movl _i(%rip), %eax +; CHECK-NEXT: cmpl $5, %eax +; CHECK-NEXT: je _qux ## TAILCALL +; CHECK-NEXT: ## %bb.1: ## %entry +; CHECK-NEXT: cmpl $2, %eax +; CHECK-NEXT: je _quux ## TAILCALL +; CHECK-NEXT: ## %bb.2: ## %return +; CHECK-NEXT: retq +entry: + %val = load i32, ptr @i, align 4 + switch i32 %val, label %return [ + i32 2, label %case_2 + i32 5, label %case_5 + ] + +case_2: + %rv_unused = tail call i32 @quux() + br label %return + +case_5: + %rv = tail call i32 @qux() + br label %return + +return: + %phi = phi i32 [ undef, %case_2 ], [ %rv, %case_5 ], [ undef, %entry ] + ret i32 %phi +} + declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1) declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) declare noalias ptr @malloc(i64) declare ptr @strcpy(ptr noalias returned writeonly, ptr noalias nocapture readonly) declare ptr @baz(ptr, ptr) +declare i32 @qux() +declare i32 @quux() From c5253aa136ac6ba683b367b2bae0dde1a543d1df Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Thu, 22 Feb 2024 09:19:48 +0000 Subject: [PATCH 193/351] [AArch64] Restore Z-registers before P-registers (#79623) (#82492) This is needed by PR#77665[1] that uses a P-register while restoring Z-registers. The reverse for SVE register restore in the epilogue was added to guarantee performance, but further work was done to improve sve frame restore and besides that the schedule also may change the order of the restore, undoing the reverse restore. 
This also fix the problem reported in (PR #79623) on Windows with std::reverse and .base(). [1]https://github.com/llvm/llvm-project/pull/77665 --- .../Target/AArch64/AArch64FrameLowering.cpp | 19 ++-- .../framelayout-sve-calleesaves-fix.mir | 2 +- llvm/test/CodeGen/AArch64/framelayout-sve.mir | 24 ++--- .../sme-streaming-compatible-interface.ll | 32 +++---- .../AArch64/sme-streaming-interface.ll | 32 +++---- .../CodeGen/AArch64/sme2-intrinsics-ld1.ll | 32 +++---- .../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 32 +++---- .../test/CodeGen/AArch64/stack-probing-sve.ll | 4 +- llvm/test/CodeGen/AArch64/sve-alloca.ll | 16 ++-- .../AArch64/sve-calling-convention-mixed.ll | 32 +++---- llvm/test/CodeGen/AArch64/sve-tailcall.ll | 32 +++---- llvm/test/CodeGen/AArch64/unwind-preserved.ll | 96 +++++++++---------- 12 files changed, 177 insertions(+), 176 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 3485edb69c910..503b1c199650f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3195,11 +3195,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return MIB->getIterator(); }; - // SVE objects are always restored in reverse order. 
- for (const RegPairInfo &RPI : reverse(RegPairs)) - if (RPI.isScalable()) - EmitMI(RPI); - if (homogeneousPrologEpilog(MF, &MBB)) { auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) .setMIFlag(MachineInstr::FrameDestroy); @@ -3210,11 +3205,19 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } + // For performance reasons restore SVE register in increasing order + auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; + auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); + auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); + std::reverse(PPRBegin, PPREnd); + auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; + auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); + auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); + std::reverse(ZPRBegin, ZPREnd); + if (ReverseCSRRestoreSeq) { MachineBasicBlock::iterator First = MBB.end(); for (const RegPairInfo &RPI : reverse(RegPairs)) { - if (RPI.isScalable()) - continue; MachineBasicBlock::iterator It = EmitMI(RPI); if (First == MBB.end()) First = It; @@ -3223,8 +3226,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MBB.splice(MBBI, &MBB, First); } else { for (const RegPairInfo &RPI : RegPairs) { - if (RPI.isScalable()) - continue; (void)EmitMI(RPI); } } diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir index 3dba21d59b408..aed3145073619 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir @@ -19,8 +19,8 @@ ; CHECK-NEXT: // implicit-def: $p4 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG - ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, 
#1, mul vl] // 16-byte Folded Reload + ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 213d7919e4a72..f7920e595e44b 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -772,9 +772,9 @@ body: | # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 # CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 -# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 @@ -873,14 +873,14 @@ body: | # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 -# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 -# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 -# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15 # CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2 # CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3 # CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 # CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17 +# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 +# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 +# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 +# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15 # CHECK: $sp = 
frame-destroy ADDVL_XXI $sp, 18 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 @@ -1037,14 +1037,14 @@ body: | # CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 +# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3 +# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 # CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15 -# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 -# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3 -# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 -# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10 @@ -1198,10 +1198,10 @@ body: | # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 7 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6 -# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1 # CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6 +# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 296f2be9cfee5..6d2abf7e18419 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -226,30 +226,30 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors( @smstart_clobber_sve( %x) nounwind { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul 
vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload @@ -267,30 +267,30 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 
2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll index b7119fc082567..ea7808d73093e 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll @@ -129,7 +129,6 @@ define @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -611,6 +610,7 @@ define 
@ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -751,7 +751,6 @@ define @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -922,6 +921,7 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1062,7 +1062,6 @@ define @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1233,6 +1232,7 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; 
CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1380,7 +1380,6 @@ define @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1560,6 +1559,7 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1711,7 +1711,6 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1726,6 +1725,7 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte 
Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1877,7 +1877,6 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1892,6 +1891,7 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -2043,7 +2043,6 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -2058,6 +2057,7 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] 
// 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -2209,7 +2209,6 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -2224,6 +2223,7 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -2375,7 +2375,6 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -2390,6 +2389,7 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, 
sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -2541,7 +2541,6 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -2556,6 +2555,7 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll index 1fb251a4f628e..7e2d28fbf7982 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll @@ -82,7 +82,6 @@ define @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -206,6 +205,7 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 
16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -299,7 +299,6 @@ define @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -423,6 +422,7 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -516,7 +516,6 @@ define @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -640,6 +639,7 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; 
CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -733,7 +733,6 @@ define @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -857,6 +856,7 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -955,7 +955,6 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -970,6 +969,7 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1071,7 +1071,6 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1086,6 +1085,7 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1188,7 +1188,6 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1203,6 +1202,7 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr 
x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1304,7 +1304,6 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1319,6 +1318,7 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1421,7 +1421,6 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1436,6 +1435,7 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1537,7 +1537,6 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; 
CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1552,6 +1551,7 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1654,7 +1654,6 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1669,6 +1668,7 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1770,7 +1770,6 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; 
CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1785,6 +1784,7 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll index 1ad78709d5012..56d865ef83e6b 100644 --- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll +++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll @@ -380,7 +380,6 @@ define void @sve_16v_1p_csr( %a) #0 { ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP -; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -397,6 +396,7 @@ define void @sve_16v_1p_csr( %a) #0 { ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #17 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -697,10 +697,10 @@ define void @sve_unprobed_area( %a, i32 %n) #0 { ; CHECK-NEXT: 
//NO_APP ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG -; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll index 47e49b84aaaff..d227538043fce 100644 --- a/llvm/test/CodeGen/AArch64/sve-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll @@ -66,30 +66,30 @@ define void @foo( %dst, i1 %cond) { ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: bl bar ; CHECK-NEXT: addvl sp, x29, #-18 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded 
Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index 9851583b950eb..3965af6a9066d 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -567,30 +567,30 @@ define @sve_caller_non_sve_callee_high_range( @sve_ret_caller_non_sve_callee_high_range() { ; CHECK-NEXT: fmov s7, #7.00000000 ; CHECK-NEXT: bl non_sve_callee_high_range ; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul 
vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr 
p5, [sp, #14, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll index f32c80d392b63..4ddf007768fd2 100644 --- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll +++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll @@ -83,30 +83,30 @@ define i32 @sve_caller_non_sve_callee( %arg) nounwind { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl non_sve_callee -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload 
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload @@ -158,30 +158,30 @@ define i32 @sve_caller_non_sve_callee_fastcc( %arg) nounwind { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl non_sve_callee -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr 
z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll index f3c4d217e6fca..822be14faaeb1 100644 --- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll +++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll @@ -63,18 +63,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul 
vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -91,6 +79,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -112,18 +112,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 
0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -140,6 +128,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: 
ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -215,18 +215,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #2 ; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -243,6 +231,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded 
Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #18 ; GISEL-NEXT: .cfi_def_cfa wsp, 16 ; GISEL-NEXT: .cfi_restore z8 @@ -264,18 +264,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #2 ; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -292,6 +280,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 
16-byte Folded Reload ; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #18 ; GISEL-NEXT: .cfi_def_cfa wsp, 16 ; GISEL-NEXT: .cfi_restore z8 From 55558cd05c998f1b287b0af97aa6db0db0bdfaa0 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 22 Feb 2024 10:22:27 +0100 Subject: [PATCH 194/351] [mlir][Transforms][NFC] Turn block type conversion into `IRRewrite` (#81756) This commit is a refactoring of the dialect conversion. The dialect conversion maintains a list of "IR rewrites" that can be committed (upon success) or rolled back (upon failure). Until now, the signature conversion of a block was only a "partial" IR rewrite. Rollbacks were triggered via `BlockTypeConversionRewrite::rollback`, but there was no `BlockTypeConversionRewrite::commit` equivalent. Overview of changes: * Remove `ArgConverter`, an internal helper class that kept track of all block type conversions. There is now a separate `BlockTypeConversionRewrite` for each block type conversion. * No more special handling for block type conversions. 
They are now normal "IR rewrites", just like "block creation" or "block movement". In particular, trigger "commits" of block type conversion via `BlockTypeConversionRewrite::commit`. * Remove `ArgConverter::notifyOpRemoved`. This function was used to inform the `ArgConverter` that an operation was erased, to prevent a double-free of operations in certain situations. It would be unpractical to add a `notifyOpRemoved` API to `IRRewrite`. Instead, erasing ops/block should go through a new `SingleEraseRewriter` (that is owned by the `ConversionPatternRewriterImpl`) if there is chance of double-free. This rewriter ignores `eraseOp`/`eraseBlock` if the op/block was already freed. --- .../Transforms/Utils/DialectConversion.cpp | 794 ++++++++---------- 1 file changed, 364 insertions(+), 430 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index afdd31a748c8c..db41b9f19e7e8 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -154,12 +154,13 @@ namespace { struct RewriterState { RewriterState(unsigned numCreatedOps, unsigned numUnresolvedMaterializations, unsigned numReplacements, unsigned numArgReplacements, - unsigned numRewrites, unsigned numIgnoredOperations) + unsigned numRewrites, unsigned numIgnoredOperations, + unsigned numErased) : numCreatedOps(numCreatedOps), numUnresolvedMaterializations(numUnresolvedMaterializations), numReplacements(numReplacements), numArgReplacements(numArgReplacements), numRewrites(numRewrites), - numIgnoredOperations(numIgnoredOperations) {} + numIgnoredOperations(numIgnoredOperations), numErased(numErased) {} /// The current number of created operations. unsigned numCreatedOps; @@ -178,6 +179,9 @@ struct RewriterState { /// The current number of ignored operations. unsigned numIgnoredOperations; + + /// The current number of erased operations/blocks. 
+ unsigned numErased; }; //===----------------------------------------------------------------------===// @@ -292,370 +296,6 @@ static Value buildUnresolvedTargetMaterialization( outputType, outputType, converter, unresolvedMaterializations); } -//===----------------------------------------------------------------------===// -// ArgConverter -//===----------------------------------------------------------------------===// -namespace { -/// This class provides a simple interface for converting the types of block -/// arguments. This is done by creating a new block that contains the new legal -/// types and extracting the block that contains the old illegal types to allow -/// for undoing pending rewrites in the case of failure. -struct ArgConverter { - ArgConverter( - PatternRewriter &rewriter, - SmallVectorImpl &unresolvedMaterializations) - : rewriter(rewriter), - unresolvedMaterializations(unresolvedMaterializations) {} - - /// This structure contains the information pertaining to an argument that has - /// been converted. - struct ConvertedArgInfo { - ConvertedArgInfo(unsigned newArgIdx, unsigned newArgSize, - Value castValue = nullptr) - : newArgIdx(newArgIdx), newArgSize(newArgSize), castValue(castValue) {} - - /// The start index of in the new argument list that contains arguments that - /// replace the original. - unsigned newArgIdx; - - /// The number of arguments that replaced the original argument. - unsigned newArgSize; - - /// The cast value that was created to cast from the new arguments to the - /// old. This only used if 'newArgSize' > 1. - Value castValue; - }; - - /// This structure contains information pertaining to a block that has had its - /// signature converted. - struct ConvertedBlockInfo { - ConvertedBlockInfo(Block *origBlock, const TypeConverter *converter) - : origBlock(origBlock), converter(converter) {} - - /// The original block that was requested to have its signature converted. 
- Block *origBlock; - - /// The conversion information for each of the arguments. The information is - /// std::nullopt if the argument was dropped during conversion. - SmallVector, 1> argInfo; - - /// The type converter used to convert the arguments. - const TypeConverter *converter; - }; - - //===--------------------------------------------------------------------===// - // Rewrite Application - //===--------------------------------------------------------------------===// - - /// Erase any rewrites registered for the blocks within the given operation - /// which is about to be removed. This merely drops the rewrites without - /// undoing them. - void notifyOpRemoved(Operation *op); - - /// Cleanup and undo any generated conversions for the arguments of block. - /// This method replaces the new block with the original, reverting the IR to - /// its original state. - void discardRewrites(Block *block); - - /// Fully replace uses of the old arguments with the new. - void applyRewrites(ConversionValueMapping &mapping); - - /// Materialize any necessary conversions for converted arguments that have - /// live users, using the provided `findLiveUser` to search for a user that - /// survives the conversion process. - LogicalResult - materializeLiveConversions(ConversionValueMapping &mapping, - OpBuilder &builder, - function_ref findLiveUser); - - //===--------------------------------------------------------------------===// - // Conversion - //===--------------------------------------------------------------------===// - - /// Attempt to convert the signature of the given block, if successful a new - /// block is returned containing the new arguments. Returns `block` if it did - /// not require conversion. - FailureOr - convertSignature(Block *block, const TypeConverter *converter, - ConversionValueMapping &mapping, - SmallVectorImpl &argReplacements); - - /// Apply the given signature conversion on the given block. 
The new block - /// containing the updated signature is returned. If no conversions were - /// necessary, e.g. if the block has no arguments, `block` is returned. - /// `converter` is used to generate any necessary cast operations that - /// translate between the origin argument types and those specified in the - /// signature conversion. - Block *applySignatureConversion( - Block *block, const TypeConverter *converter, - TypeConverter::SignatureConversion &signatureConversion, - ConversionValueMapping &mapping, - SmallVectorImpl &argReplacements); - - /// A collection of blocks that have had their arguments converted. This is a - /// map from the new replacement block, back to the original block. - llvm::MapVector conversionInfo; - - /// The pattern rewriter to use when materializing conversions. - PatternRewriter &rewriter; - - /// An ordered set of unresolved materializations during conversion. - SmallVectorImpl &unresolvedMaterializations; -}; -} // namespace - -//===----------------------------------------------------------------------===// -// Rewrite Application - -void ArgConverter::notifyOpRemoved(Operation *op) { - if (conversionInfo.empty()) - return; - - for (Region ®ion : op->getRegions()) { - for (Block &block : region) { - // Drop any rewrites from within. - for (Operation &nestedOp : block) - if (nestedOp.getNumRegions()) - notifyOpRemoved(&nestedOp); - - // Check if this block was converted. - auto *it = conversionInfo.find(&block); - if (it == conversionInfo.end()) - continue; - - // Drop all uses of the original arguments and delete the original block. 
- Block *origBlock = it->second.origBlock; - for (BlockArgument arg : origBlock->getArguments()) - arg.dropAllUses(); - conversionInfo.erase(it); - } - } -} - -void ArgConverter::discardRewrites(Block *block) { - auto *it = conversionInfo.find(block); - if (it == conversionInfo.end()) - return; - Block *origBlock = it->second.origBlock; - - // Drop all uses of the new block arguments and replace uses of the new block. - for (int i = block->getNumArguments() - 1; i >= 0; --i) - block->getArgument(i).dropAllUses(); - block->replaceAllUsesWith(origBlock); - - // Move the operations back the original block, move the original block back - // into its original location and the delete the new block. - origBlock->getOperations().splice(origBlock->end(), block->getOperations()); - block->getParent()->getBlocks().insert(Region::iterator(block), origBlock); - block->erase(); - - conversionInfo.erase(it); -} - -void ArgConverter::applyRewrites(ConversionValueMapping &mapping) { - for (auto &info : conversionInfo) { - ConvertedBlockInfo &blockInfo = info.second; - Block *origBlock = blockInfo.origBlock; - - // Process the remapping for each of the original arguments. - for (unsigned i = 0, e = origBlock->getNumArguments(); i != e; ++i) { - std::optional &argInfo = blockInfo.argInfo[i]; - BlockArgument origArg = origBlock->getArgument(i); - - // Handle the case of a 1->0 value mapping. - if (!argInfo) { - if (Value newArg = mapping.lookupOrNull(origArg, origArg.getType())) - origArg.replaceAllUsesWith(newArg); - continue; - } - - // Otherwise this is a 1->1+ value mapping. - Value castValue = argInfo->castValue; - assert(argInfo->newArgSize >= 1 && castValue && "expected 1->1+ mapping"); - - // If the argument is still used, replace it with the generated cast. 
- if (!origArg.use_empty()) { - origArg.replaceAllUsesWith( - mapping.lookupOrDefault(castValue, origArg.getType())); - } - } - - delete origBlock; - blockInfo.origBlock = nullptr; - } -} - -LogicalResult ArgConverter::materializeLiveConversions( - ConversionValueMapping &mapping, OpBuilder &builder, - function_ref findLiveUser) { - for (auto &info : conversionInfo) { - Block *newBlock = info.first; - ConvertedBlockInfo &blockInfo = info.second; - Block *origBlock = blockInfo.origBlock; - - // Process the remapping for each of the original arguments. - for (unsigned i = 0, e = origBlock->getNumArguments(); i != e; ++i) { - // If the type of this argument changed and the argument is still live, we - // need to materialize a conversion. - BlockArgument origArg = origBlock->getArgument(i); - if (mapping.lookupOrNull(origArg, origArg.getType())) - continue; - Operation *liveUser = findLiveUser(origArg); - if (!liveUser) - continue; - - Value replacementValue = mapping.lookupOrDefault(origArg); - bool isDroppedArg = replacementValue == origArg; - if (isDroppedArg) - rewriter.setInsertionPointToStart(newBlock); - else - rewriter.setInsertionPointAfterValue(replacementValue); - Value newArg; - if (blockInfo.converter) { - newArg = blockInfo.converter->materializeSourceConversion( - rewriter, origArg.getLoc(), origArg.getType(), - isDroppedArg ? 
ValueRange() : ValueRange(replacementValue)); - assert((!newArg || newArg.getType() == origArg.getType()) && - "materialization hook did not provide a value of the expected " - "type"); - } - if (!newArg) { - InFlightDiagnostic diag = - emitError(origArg.getLoc()) - << "failed to materialize conversion for block argument #" << i - << " that remained live after conversion, type was " - << origArg.getType(); - if (!isDroppedArg) - diag << ", with target type " << replacementValue.getType(); - diag.attachNote(liveUser->getLoc()) - << "see existing live user here: " << *liveUser; - return failure(); - } - mapping.map(origArg, newArg); - } - } - return success(); -} - -//===----------------------------------------------------------------------===// -// Conversion - -FailureOr ArgConverter::convertSignature( - Block *block, const TypeConverter *converter, - ConversionValueMapping &mapping, - SmallVectorImpl &argReplacements) { - assert(block->getParent() && "cannot convert signature of detached block"); - - // If a converter wasn't provided, and the block wasn't already converted, - // there is nothing we can do. - if (!converter) - return failure(); - - // Try to convert the signature for the block with the provided converter. - if (auto conversion = converter->convertBlockSignature(block)) - return applySignatureConversion(block, converter, *conversion, mapping, - argReplacements); - return failure(); -} - -Block *ArgConverter::applySignatureConversion( - Block *block, const TypeConverter *converter, - TypeConverter::SignatureConversion &signatureConversion, - ConversionValueMapping &mapping, - SmallVectorImpl &argReplacements) { - // If no arguments are being changed or added, there is nothing to do. 
- unsigned origArgCount = block->getNumArguments(); - auto convertedTypes = signatureConversion.getConvertedTypes(); - if (llvm::equal(block->getArgumentTypes(), convertedTypes)) - return block; - - // Split the block at the beginning to get a new block to use for the updated - // signature. - Block *newBlock = block->splitBlock(block->begin()); - block->replaceAllUsesWith(newBlock); - // Unlink the block, but do not erase it yet, so that the change can be rolled - // back. - block->getParent()->getBlocks().remove(block); - - // Map all new arguments to the location of the argument they originate from. - SmallVector newLocs(convertedTypes.size(), - rewriter.getUnknownLoc()); - for (unsigned i = 0; i < origArgCount; ++i) { - auto inputMap = signatureConversion.getInputMapping(i); - if (!inputMap || inputMap->replacementValue) - continue; - Location origLoc = block->getArgument(i).getLoc(); - for (unsigned j = 0; j < inputMap->size; ++j) - newLocs[inputMap->inputNo + j] = origLoc; - } - - SmallVector newArgRange( - newBlock->addArguments(convertedTypes, newLocs)); - ArrayRef newArgs(newArgRange); - - // Remap each of the original arguments as determined by the signature - // conversion. - ConvertedBlockInfo info(block, converter); - info.argInfo.resize(origArgCount); - - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(newBlock); - for (unsigned i = 0; i != origArgCount; ++i) { - auto inputMap = signatureConversion.getInputMapping(i); - if (!inputMap) - continue; - BlockArgument origArg = block->getArgument(i); - - // If inputMap->replacementValue is not nullptr, then the argument is - // dropped and a replacement value is provided to be the remappedValue. 
- if (inputMap->replacementValue) { - assert(inputMap->size == 0 && - "invalid to provide a replacement value when the argument isn't " - "dropped"); - mapping.map(origArg, inputMap->replacementValue); - argReplacements.push_back(origArg); - continue; - } - - // Otherwise, this is a 1->1+ mapping. - auto replArgs = newArgs.slice(inputMap->inputNo, inputMap->size); - Value newArg; - - // If this is a 1->1 mapping and the types of new and replacement arguments - // match (i.e. it's an identity map), then the argument is mapped to its - // original type. - // FIXME: We simply pass through the replacement argument if there wasn't a - // converter, which isn't great as it allows implicit type conversions to - // appear. We should properly restructure this code to handle cases where a - // converter isn't provided and also to properly handle the case where an - // argument materialization is actually a temporary source materialization - // (e.g. in the case of 1->N). - if (replArgs.size() == 1 && - (!converter || replArgs[0].getType() == origArg.getType())) { - newArg = replArgs.front(); - } else { - Type origOutputType = origArg.getType(); - - // Legalize the argument output type. 
- Type outputType = origOutputType; - if (Type legalOutputType = converter->convertType(outputType)) - outputType = legalOutputType; - - newArg = buildUnresolvedArgumentMaterialization( - rewriter, origArg.getLoc(), replArgs, origOutputType, outputType, - converter, unresolvedMaterializations); - } - - mapping.map(origArg, newArg); - argReplacements.push_back(origArg); - info.argInfo[i] = - ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg); - } - - conversionInfo.insert({newBlock, std::move(info)}); - return newBlock; -} - //===----------------------------------------------------------------------===// // IR rewrites //===----------------------------------------------------------------------===// @@ -702,6 +342,12 @@ class IRRewrite { IRRewrite(Kind kind, ConversionPatternRewriterImpl &rewriterImpl) : kind(kind), rewriterImpl(rewriterImpl) {} + /// Erase the given op (unless it was already erased). + void eraseOp(Operation *op); + + /// Erase the given block (unless it was already erased). + void eraseBlock(Block *block); + const Kind kind; ConversionPatternRewriterImpl &rewriterImpl; }; @@ -744,8 +390,7 @@ class CreateBlockRewrite : public BlockRewrite { auto &blockOps = block->getOperations(); while (!blockOps.empty()) blockOps.remove(blockOps.begin()); - block->dropAllDefinedValueUses(); - block->erase(); + eraseBlock(block); } }; @@ -881,8 +526,7 @@ class SplitBlockRewrite : public BlockRewrite { // Merge back the block that was split out. originalBlock->getOperations().splice(originalBlock->end(), block->getOperations()); - block->dropAllDefinedValueUses(); - block->erase(); + eraseBlock(block); } private: @@ -890,20 +534,59 @@ class SplitBlockRewrite : public BlockRewrite { Block *originalBlock; }; +/// This structure contains the information pertaining to an argument that has +/// been converted. 
+struct ConvertedArgInfo { + ConvertedArgInfo(unsigned newArgIdx, unsigned newArgSize, + Value castValue = nullptr) + : newArgIdx(newArgIdx), newArgSize(newArgSize), castValue(castValue) {} + + /// The start index in the new argument list that contains arguments that + /// replace the original. + unsigned newArgIdx; + + /// The number of arguments that replaced the original argument. + unsigned newArgSize; + + /// The cast value that was created to cast from the new arguments to the + /// old. This is only used if 'newArgSize' > 1. + Value castValue; +}; + /// Block type conversion. This rewrite is partially reflected in the IR. class BlockTypeConversionRewrite : public BlockRewrite { public: - BlockTypeConversionRewrite(ConversionPatternRewriterImpl &rewriterImpl, - Block *block) - : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, block) {} + BlockTypeConversionRewrite( + ConversionPatternRewriterImpl &rewriterImpl, Block *block, + Block *origBlock, SmallVector, 1> argInfo, + const TypeConverter *converter) + : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, block), + origBlock(origBlock), argInfo(argInfo), converter(converter) {} static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() == Kind::BlockTypeConversion; } - // TODO: Block type conversions are currently committed in - // `ArgConverter::applyRewrites`. This should be done in the "commit" method. + /// Materialize any necessary conversions for converted arguments that have + /// live users, using the provided `findLiveUser` to search for a user that + /// survives the conversion process. + LogicalResult + materializeLiveConversions(function_ref findLiveUser); + + void commit() override; + void rollback() override; + +private: + /// The original block that was requested to have its signature converted. + Block *origBlock; + + /// The conversion information for each of the arguments. The information is + /// std::nullopt if the argument was dropped during conversion.
+ SmallVector, 1> argInfo; + + /// The type converter used to convert the arguments. + const TypeConverter *converter; }; /// An operation rewrite. @@ -949,8 +632,8 @@ class MoveOperationRewrite : public OperationRewrite { // The block in which this operation was previously contained. Block *block; - // The original successor of this operation before it was moved. "nullptr" if - // this operation was the only operation in the region. + // The original successor of this operation before it was moved. "nullptr" + // if this operation was the only operation in the region. Operation *insertBeforeOp; }; @@ -1027,6 +710,26 @@ static bool hasRewrite(R &&rewrites, Operation *op) { }); } +/// Find the single rewrite object of the specified type and block among the +/// given rewrites. In debug mode, asserts that there is mo more than one such +/// object. Return "nullptr" if no object was found. +template +static RewriteTy *findSingleRewrite(R &&rewrites, Block *block) { + RewriteTy *result = nullptr; + for (auto &rewrite : rewrites) { + auto *rewriteTy = dyn_cast(rewrite.get()); + if (rewriteTy && rewriteTy->getBlock() == block) { +#ifndef NDEBUG + assert(!result && "expected single matching rewrite"); + result = rewriteTy; +#else + return rewriteTy; +#endif // NDEBUG + } + } + return result; +} + //===----------------------------------------------------------------------===// // ConversionPatternRewriterImpl //===----------------------------------------------------------------------===// @@ -1034,7 +737,7 @@ namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(PatternRewriter &rewriter) - : argConverter(rewriter, unresolvedMaterializations), + : rewriter(rewriter), eraseRewriter(rewriter.getContext()), notifyCallback(nullptr) {} /// Cleanup and destroy any generated rewrite operations. 
This method is @@ -1084,15 +787,33 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// removes them from being considered for legalization. void markNestedOpsIgnored(Operation *op); + /// Detach any operations nested in the given operation from their parent + /// blocks, and erase the given operation. This can be used when the nested + /// operations are scheduled for erasure themselves, so deleting the regions + /// of the given operation together with their content would result in + /// double-free. This happens, for example, when rolling back op creation in + /// the reverse order and if the nested ops were created before the parent op. + /// This function does not need to collect nested ops recursively because it + /// is expected to also be called for each nested op when it is about to be + /// deleted. + void detachNestedAndErase(Operation *op); + //===--------------------------------------------------------------------===// // Type Conversion //===--------------------------------------------------------------------===// - /// Convert the signature of the given block. + /// Attempt to convert the signature of the given block, if successful a new + /// block is returned containing the new arguments. Returns `block` if it did + /// not require conversion. FailureOr convertBlockSignature( Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion *conversion = nullptr); + /// Convert the types of non-entry block arguments within the given region. + LogicalResult convertNonEntryRegionTypes( + Region *region, const TypeConverter &converter, + ArrayRef blockConversions = {}); + /// Apply a signature conversion on the given region, using `converter` for /// materializations if not null. 
Block * @@ -1105,10 +826,15 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { convertRegionTypes(Region *region, const TypeConverter &converter, TypeConverter::SignatureConversion *entryConversion); - /// Convert the types of non-entry block arguments within the given region. - LogicalResult convertNonEntryRegionTypes( - Region *region, const TypeConverter &converter, - ArrayRef blockConversions = {}); + /// Apply the given signature conversion on the given block. The new block + /// containing the updated signature is returned. If no conversions were + /// necessary, e.g. if the block has no arguments, `block` is returned. + /// `converter` is used to generate any necessary cast operations that + /// translate between the origin argument types and those specified in the + /// signature conversion. + Block *applySignatureConversion( + Block *block, const TypeConverter *converter, + TypeConverter::SignatureConversion &signatureConversion); //===--------------------------------------------------------------------===// // Rewriter Notification Hooks @@ -1140,17 +866,54 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { notifyMatchFailure(Location loc, function_ref reasonCallback) override; + //===--------------------------------------------------------------------===// + // IR Erasure + //===--------------------------------------------------------------------===// + + /// A rewriter that keeps track of erased ops and blocks. It ensures that no + /// operation or block is erased multiple times. This rewriter assumes that + /// no new IR is created between calls to `eraseOp`/`eraseBlock`. + struct SingleEraseRewriter : public RewriterBase, RewriterBase::Listener { + public: + SingleEraseRewriter(MLIRContext *context) + : RewriterBase(context, /*listener=*/this) {} + + /// Erase the given op (unless it was already erased). 
+ void eraseOp(Operation *op) override { + if (erased.contains(op)) + return; + op->dropAllUses(); + RewriterBase::eraseOp(op); + } + + /// Erase the given block (unless it was already erased). + void eraseBlock(Block *block) override { + if (erased.contains(block)) + return; + block->dropAllDefinedValueUses(); + RewriterBase::eraseBlock(block); + } + + void notifyOperationErased(Operation *op) override { erased.insert(op); } + void notifyBlockErased(Block *block) override { erased.insert(block); } + + /// Pointers to all erased operations and blocks. + SetVector erased; + }; + //===--------------------------------------------------------------------===// // State //===--------------------------------------------------------------------===// + PatternRewriter &rewriter; + + /// This rewriter must be used for erasing ops/blocks. + SingleEraseRewriter eraseRewriter; + // Mapping between replaced values that differ in type. This happens when // replacing a value with one of a different type. ConversionValueMapping mapping; - /// Utility used to convert block arguments. - ArgConverter argConverter; - /// Ordered vector of all of the newly created operations during conversion. SmallVector createdOps; @@ -1207,20 +970,100 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { } // namespace detail } // namespace mlir +void IRRewrite::eraseOp(Operation *op) { + rewriterImpl.eraseRewriter.eraseOp(op); +} + +void IRRewrite::eraseBlock(Block *block) { + rewriterImpl.eraseRewriter.eraseBlock(block); +} + +void BlockTypeConversionRewrite::commit() { + // Process the remapping for each of the original arguments. + for (auto [origArg, info] : + llvm::zip_equal(origBlock->getArguments(), argInfo)) { + // Handle the case of a 1->0 value mapping. + if (!info) { + if (Value newArg = + rewriterImpl.mapping.lookupOrNull(origArg, origArg.getType())) + origArg.replaceAllUsesWith(newArg); + continue; + } + + // Otherwise this is a 1->1+ value mapping. 
+ Value castValue = info->castValue; + assert(info->newArgSize >= 1 && castValue && "expected 1->1+ mapping"); + + // If the argument is still used, replace it with the generated cast. + if (!origArg.use_empty()) { + origArg.replaceAllUsesWith( + rewriterImpl.mapping.lookupOrDefault(castValue, origArg.getType())); + } + } + + delete origBlock; + origBlock = nullptr; +} + void BlockTypeConversionRewrite::rollback() { - // Undo the type conversion. - rewriterImpl.argConverter.discardRewrites(block); -} - -/// Detach any operations nested in the given operation from their parent -/// blocks, and erase the given operation. This can be used when the nested -/// operations are scheduled for erasure themselves, so deleting the regions of -/// the given operation together with their content would result in double-free. -/// This happens, for example, when rolling back op creation in the reverse -/// order and if the nested ops were created before the parent op. This function -/// does not need to collect nested ops recursively because it is expected to -/// also be called for each nested op when it is about to be deleted. -static void detachNestedAndErase(Operation *op) { + // Drop all uses of the new block arguments and replace uses of the new block. + for (int i = block->getNumArguments() - 1; i >= 0; --i) + block->getArgument(i).dropAllUses(); + block->replaceAllUsesWith(origBlock); + + // Move the operations back the original block, move the original block back + // into its original location and the delete the new block. + origBlock->getOperations().splice(origBlock->end(), block->getOperations()); + block->getParent()->getBlocks().insert(Region::iterator(block), origBlock); + eraseBlock(block); +} + +LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( + function_ref findLiveUser) { + // Process the remapping for each of the original arguments. 
+ for (auto it : llvm::enumerate(origBlock->getArguments())) { + // If the type of this argument changed and the argument is still live, we + // need to materialize a conversion. + BlockArgument origArg = it.value(); + if (rewriterImpl.mapping.lookupOrNull(origArg, origArg.getType())) + continue; + Operation *liveUser = findLiveUser(origArg); + if (!liveUser) + continue; + + Value replacementValue = rewriterImpl.mapping.lookupOrDefault(origArg); + bool isDroppedArg = replacementValue == origArg; + if (isDroppedArg) + rewriterImpl.rewriter.setInsertionPointToStart(getBlock()); + else + rewriterImpl.rewriter.setInsertionPointAfterValue(replacementValue); + Value newArg; + if (converter) { + newArg = converter->materializeSourceConversion( + rewriterImpl.rewriter, origArg.getLoc(), origArg.getType(), + isDroppedArg ? ValueRange() : ValueRange(replacementValue)); + assert((!newArg || newArg.getType() == origArg.getType()) && + "materialization hook did not provide a value of the expected " + "type"); + } + if (!newArg) { + InFlightDiagnostic diag = + emitError(origArg.getLoc()) + << "failed to materialize conversion for block argument #" + << it.index() << " that remained live after conversion, type was " + << origArg.getType(); + if (!isDroppedArg) + diag << ", with target type " << replacementValue.getType(); + diag.attachNote(liveUser->getLoc()) + << "see existing live user here: " << *liveUser; + return failure(); + } + rewriterImpl.mapping.map(origArg, newArg); + } + return success(); +} + +void ConversionPatternRewriterImpl::detachNestedAndErase(Operation *op) { for (Region ®ion : op->getRegions()) { for (Block &block : region.getBlocks()) { while (!block.getOperations().empty()) @@ -1228,8 +1071,7 @@ static void detachNestedAndErase(Operation *op) { block.dropAllDefinedValueUses(); } } - op->dropAllUses(); - op->erase(); + eraseRewriter.eraseOp(op); } void ConversionPatternRewriterImpl::discardRewrites() { @@ -1248,11 +1090,6 @@ void 
ConversionPatternRewriterImpl::applyRewrites() { for (OpResult result : repl.first->getResults()) if (Value newValue = mapping.lookupOrNull(result, result.getType())) result.replaceAllUsesWith(newValue); - - // If this operation defines any regions, drop any pending argument - // rewrites. - if (repl.first->getNumRegions()) - argConverter.notifyOpRemoved(repl.first); } // Apply all of the requested argument replacements. @@ -1279,22 +1116,16 @@ void ConversionPatternRewriterImpl::applyRewrites() { // Drop all of the unresolved materialization operations created during // conversion. - for (auto &mat : unresolvedMaterializations) { - mat.getOp()->dropAllUses(); - mat.getOp()->erase(); - } + for (auto &mat : unresolvedMaterializations) + eraseRewriter.eraseOp(mat.getOp()); // In a second pass, erase all of the replaced operations in reverse. This // allows processing nested operations before their parent region is // destroyed. Because we process in reverse order, producers may be deleted // before their users (a pattern deleting a producer and then the consumer) // so we first drop all uses explicitly. - for (auto &repl : llvm::reverse(replacements)) { - repl.first->dropAllUses(); - repl.first->erase(); - } - - argConverter.applyRewrites(mapping); + for (auto &repl : llvm::reverse(replacements)) + eraseRewriter.eraseOp(repl.first); // Commit all rewrites. 
for (auto &rewrite : rewrites) @@ -1307,7 +1138,8 @@ void ConversionPatternRewriterImpl::applyRewrites() { RewriterState ConversionPatternRewriterImpl::getCurrentState() { return RewriterState(createdOps.size(), unresolvedMaterializations.size(), replacements.size(), argReplacements.size(), - rewrites.size(), ignoredOps.size()); + rewrites.size(), ignoredOps.size(), + eraseRewriter.erased.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { @@ -1355,6 +1187,9 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state) { while (!operationsWithChangedResults.empty() && operationsWithChangedResults.back() >= state.numReplacements) operationsWithChangedResults.pop_back(); + + while (eraseRewriter.erased.size() != state.numErased) + eraseRewriter.erased.pop_back(); } void ConversionPatternRewriterImpl::undoRewrites(unsigned numRewritesToKeep) { @@ -1443,18 +1278,18 @@ void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) { FailureOr ConversionPatternRewriterImpl::convertBlockSignature( Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion *conversion) { - FailureOr result = - conversion ? argConverter.applySignatureConversion( - block, converter, *conversion, mapping, argReplacements) - : argConverter.convertSignature(block, converter, mapping, - argReplacements); - if (failed(result)) + if (conversion) + return applySignatureConversion(block, converter, *conversion); + + // If a converter wasn't provided, and the block wasn't already converted, + // there is nothing we can do. + if (!converter) return failure(); - if (Block *newBlock = *result) { - if (newBlock != block) - appendRewrite(newBlock); - } - return result; + + // Try to convert the signature for the block with the provided converter. 
+ if (auto conversion = converter->convertBlockSignature(block)) + return applySignatureConversion(block, converter, *conversion); + return failure(); } Block *ConversionPatternRewriterImpl::applySignatureConversion( @@ -1508,6 +1343,102 @@ LogicalResult ConversionPatternRewriterImpl::convertNonEntryRegionTypes( return success(); } +Block *ConversionPatternRewriterImpl::applySignatureConversion( + Block *block, const TypeConverter *converter, + TypeConverter::SignatureConversion &signatureConversion) { + // If no arguments are being changed or added, there is nothing to do. + unsigned origArgCount = block->getNumArguments(); + auto convertedTypes = signatureConversion.getConvertedTypes(); + if (llvm::equal(block->getArgumentTypes(), convertedTypes)) + return block; + + // Split the block at the beginning to get a new block to use for the updated + // signature. + Block *newBlock = block->splitBlock(block->begin()); + block->replaceAllUsesWith(newBlock); + // Unlink the block, but do not erase it yet, so that the change can be rolled + // back. + block->getParent()->getBlocks().remove(block); + + // Map all new arguments to the location of the argument they originate from. + SmallVector newLocs(convertedTypes.size(), + rewriter.getUnknownLoc()); + for (unsigned i = 0; i < origArgCount; ++i) { + auto inputMap = signatureConversion.getInputMapping(i); + if (!inputMap || inputMap->replacementValue) + continue; + Location origLoc = block->getArgument(i).getLoc(); + for (unsigned j = 0; j < inputMap->size; ++j) + newLocs[inputMap->inputNo + j] = origLoc; + } + + SmallVector newArgRange( + newBlock->addArguments(convertedTypes, newLocs)); + ArrayRef newArgs(newArgRange); + + // Remap each of the original arguments as determined by the signature + // conversion. 
+ SmallVector, 1> argInfo; + argInfo.resize(origArgCount); + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(newBlock); + for (unsigned i = 0; i != origArgCount; ++i) { + auto inputMap = signatureConversion.getInputMapping(i); + if (!inputMap) + continue; + BlockArgument origArg = block->getArgument(i); + + // If inputMap->replacementValue is not nullptr, then the argument is + // dropped and a replacement value is provided to be the remappedValue. + if (inputMap->replacementValue) { + assert(inputMap->size == 0 && + "invalid to provide a replacement value when the argument isn't " + "dropped"); + mapping.map(origArg, inputMap->replacementValue); + argReplacements.push_back(origArg); + continue; + } + + // Otherwise, this is a 1->1+ mapping. + auto replArgs = newArgs.slice(inputMap->inputNo, inputMap->size); + Value newArg; + + // If this is a 1->1 mapping and the types of new and replacement arguments + // match (i.e. it's an identity map), then the argument is mapped to its + // original type. + // FIXME: We simply pass through the replacement argument if there wasn't a + // converter, which isn't great as it allows implicit type conversions to + // appear. We should properly restructure this code to handle cases where a + // converter isn't provided and also to properly handle the case where an + // argument materialization is actually a temporary source materialization + // (e.g. in the case of 1->N). + if (replArgs.size() == 1 && + (!converter || replArgs[0].getType() == origArg.getType())) { + newArg = replArgs.front(); + } else { + Type origOutputType = origArg.getType(); + + // Legalize the argument output type. 
+ Type outputType = origOutputType; + if (Type legalOutputType = converter->convertType(outputType)) + outputType = legalOutputType; + + newArg = buildUnresolvedArgumentMaterialization( + rewriter, origArg.getLoc(), replArgs, origOutputType, outputType, + converter, unresolvedMaterializations); + } + + mapping.map(origArg, newArg); + argReplacements.push_back(origArg); + argInfo[i] = ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg); + } + + appendRewrite(newBlock, block, argInfo, + converter); + return newBlock; +} + //===----------------------------------------------------------------------===// // Rewriter Notification Hooks @@ -2635,8 +2566,11 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( }); return liveUserIt == val.user_end() ? nullptr : *liveUserIt; }; - return rewriterImpl.argConverter.materializeLiveConversions( - rewriterImpl.mapping, rewriter, findLiveUser); + for (auto &r : rewriterImpl.rewrites) + if (auto *rewrite = dyn_cast(r.get())) + if (failed(rewrite->materializeLiveConversions(findLiveUser))) + return failure(); + return success(); } /// Replace the results of a materialization operation with the given values. 
From fddf23c6f4478fc39b0077538d288082f983ce80 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Thu, 22 Feb 2024 10:27:59 +0100 Subject: [PATCH 195/351] [SPIRV] Add support for the SPV_KHR_subgroup_rotate extension (#82374) This PR adds support for the SPV_KHR_subgroup_rotate extension that enables rotating values across invocations within a subgroup: * https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/KHR/SPV_KHR_subgroup_rotate.asciidoc --- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 7 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 5 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 9 + llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 4 + .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 1 + .../subgroup-rotate.ll | 357 ++++++++++++++++++ 6 files changed, 382 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index e6e3560d02f58..28a63b93b43b6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -619,7 +619,8 @@ class GroupBuiltin { !eq(operation, OpGroupNonUniformShuffleDown), !eq(operation, OpGroupBroadcast), !eq(operation, OpGroupNonUniformBroadcast), - !eq(operation, OpGroupNonUniformBroadcastFirst)); + !eq(operation, OpGroupNonUniformBroadcastFirst), + !eq(operation, OpGroupNonUniformRotateKHR)); bit HasBoolArg = !or(!and(IsAllOrAny, !eq(IsAllEqual, false)), IsBallot, IsLogical); } @@ -877,6 +878,10 @@ defm : DemangledGroupBuiltin<"group_non_uniform_scan_inclusive_logical_xors", Wo defm : DemangledGroupBuiltin<"group_non_uniform_scan_exclusive_logical_xors", WorkOrSub, OpGroupNonUniformLogicalXor>; defm : DemangledGroupBuiltin<"group_clustered_reduce_logical_xor", WorkOrSub, OpGroupNonUniformLogicalXor>; +// cl_khr_subgroup_rotate / SPV_KHR_subgroup_rotate +defm : 
DemangledGroupBuiltin<"group_rotate", OnlySub, OpGroupNonUniformRotateKHR>; +defm : DemangledGroupBuiltin<"group_clustered_rotate", OnlySub, OpGroupNonUniformRotateKHR>; + // cl_khr_work_group_uniform_arithmetic / SPV_KHR_uniform_group_instructions defm : DemangledGroupBuiltin<"group_reduce_imul", OnlyWork, OpGroupIMulKHR>; defm : DemangledGroupBuiltin<"group_reduce_mulu", OnlyWork, OpGroupIMulKHR>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 0f11bc34d176f..86f65b6320d53 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -765,6 +765,11 @@ def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>; def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>; def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>; +// SPV_KHR_subgroup_rotate +def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res), + (ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops), + "$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">; + // 3.49.7, Constant-Creation Instructions // - SPV_INTEL_function_pointers diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index dbda2871e153d..9b9575b987994 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1069,6 +1069,15 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL); } break; + case SPIRV::OpGroupNonUniformRotateKHR: + if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate)) + report_fatal_error("OpGroupNonUniformRotateKHR instruction requires the " + "following SPIR-V extension: SPV_KHR_subgroup_rotate", + false); + Reqs.addExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate); + Reqs.addCapability(SPIRV::Capability::GroupNonUniformRotateKHR); + Reqs.addCapability(SPIRV::Capability::GroupNonUniform); + 
break; case SPIRV::OpGroupIMulKHR: case SPIRV::OpGroupFMulKHR: case SPIRV::OpGroupBitwiseAndKHR: diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index e186154aa408b..4694363614ef6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -75,6 +75,10 @@ cl::list Extensions( "Allows to use the LinkOnceODR linkage type that is to let " "a function or global variable to be merged with other functions " "or global variables of the same name when linkage occurs."), + clEnumValN(SPIRV::Extension::SPV_KHR_subgroup_rotate, + "SPV_KHR_subgroup_rotate", + "Adds a new instruction that enables rotating values across " + "invocations within a subgroup."), clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers, "SPV_INTEL_function_pointers", "Allows translation of function pointers."))); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 4e5ac0d531b2d..6c36087baa85e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -455,6 +455,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions], defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>; defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>; defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>; +defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>; defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>; diff --git 
a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll new file mode 100644 index 0000000000000..b1d6a09c7fe35 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll @@ -0,0 +1,357 @@ +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_KHR_subgroup_rotate %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-extensions=SPV_KHR_subgroup_rotate %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: OpGroupNonUniformRotateKHR instruction requires the following SPIR-V extension: SPV_KHR_subgroup_rotate + +; CHECK: OpCapability GroupNonUniformRotateKHR +; CHECK: OpExtension "SPV_KHR_subgroup_rotate" + +; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0 +; CHECK-DAG: %[[TyInt16:.*]] = OpTypeInt 16 0 +; CHECK-DAG: %[[TyInt32:.*]] = OpTypeInt 32 0 +; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0 +; CHECK-DAG: %[[TyFloat:.*]] = OpTypeFloat 32 +; CHECK-DAG: %[[TyHalf:.*]] = OpTypeFloat 16 +; CHECK-DAG: %[[TyDouble:.*]] = OpTypeFloat 64 +; CHECK-DAG: %[[ScopeSubgroup:.*]] = OpConstant %[[TyInt32]] 3 +; CHECK-DAG: %[[ConstInt2:.*]] = OpConstant %[[TyInt32]] 2 +; CHECK-DAG: %[[ConstInt4:.*]] = OpConstant %[[TyInt32]] 4 + +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir" + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateChar(ptr addrspace(1) noundef align 1 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i8, align 1 + store ptr addrspace(1) %dst, ptr 
%dst.addr, align 4 + store i8 0, ptr %v, align 1 + %value = load i8, ptr %v, align 1 +; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func signext i8 @_Z16sub_group_rotateci(i8 noundef signext %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %data, i32 0 + store i8 %call, ptr addrspace(1) %arrayidx, align 1 + %value_clustered = load i8, ptr %v, align 1 +; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func signext i8 @_Z26sub_group_clustered_rotatecij(i8 noundef signext %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data2, i32 1 + store i8 %call1, ptr addrspace(1) %arrayidx2, align 1 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func signext i8 @_Z16sub_group_rotateci(i8 noundef signext, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func signext i8 @_Z26sub_group_clustered_rotatecij(i8 noundef signext, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateUChar(ptr addrspace(1) noundef align 1 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i8, align 1 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i8 0, ptr %v, align 1 + %value = load i8, ptr %v, align 1 +; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func zeroext i8 @_Z16sub_group_rotatehi(i8 noundef zeroext %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = 
getelementptr inbounds i8, ptr addrspace(1) %data, i32 0 + store i8 %call, ptr addrspace(1) %arrayidx, align 1 + %value_clustered = load i8, ptr %v, align 1 +; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func zeroext i8 @_Z26sub_group_clustered_rotatehij(i8 noundef zeroext %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data2, i32 1 + store i8 %call1, ptr addrspace(1) %arrayidx2, align 1 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func zeroext i8 @_Z16sub_group_rotatehi(i8 noundef zeroext, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func zeroext i8 @_Z26sub_group_clustered_rotatehij(i8 noundef zeroext, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateShort(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !8 !kernel_arg_base_type !8 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i16, align 2 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i16 0, ptr %v, align 2 + %value = load i16, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func signext i16 @_Z16sub_group_rotatesi(i16 noundef signext %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %data, i32 0 + store i16 %call, ptr addrspace(1) %arrayidx, align 2 + %value_clustered = load i16, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func signext i16 @_Z26sub_group_clustered_rotatesij(i16 
noundef signext %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %data2, i32 1 + store i16 %call1, ptr addrspace(1) %arrayidx2, align 2 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func signext i16 @_Z16sub_group_rotatesi(i16 noundef signext, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func signext i16 @_Z26sub_group_clustered_rotatesij(i16 noundef signext, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateUShort(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i16, align 2 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i16 0, ptr %v, align 2 + %value = load i16, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func zeroext i16 @_Z16sub_group_rotateti(i16 noundef zeroext %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %data, i32 0 + store i16 %call, ptr addrspace(1) %arrayidx, align 2 + %value_clustered = load i16, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func zeroext i16 @_Z26sub_group_clustered_rotatetij(i16 noundef zeroext %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %data2, i32 1 + store i16 %call1, ptr addrspace(1) %arrayidx2, align 2 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func 
zeroext i16 @_Z16sub_group_rotateti(i16 noundef zeroext, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func zeroext i16 @_Z26sub_group_clustered_rotatetij(i16 noundef zeroext, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateInt(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i32, align 4 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i32 0, ptr %v, align 4 + %value = load i32, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func i32 @_Z16sub_group_rotateii(i32 noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %data, i32 0 + store i32 %call, ptr addrspace(1) %arrayidx, align 4 + %value_clustered = load i32, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func i32 @_Z26sub_group_clustered_rotateiij(i32 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %data2, i32 1 + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func i32 @_Z16sub_group_rotateii(i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func i32 @_Z26sub_group_clustered_rotateiij(i32 noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateUInt(ptr addrspace(1) noundef align 4 %dst) #0 
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i32, align 4 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i32 0, ptr %v, align 4 + %value = load i32, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func i32 @_Z16sub_group_rotateji(i32 noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %data, i32 0 + store i32 %call, ptr addrspace(1) %arrayidx, align 4 + %value_clustered = load i32, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func i32 @_Z26sub_group_clustered_rotatejij(i32 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %data2, i32 1 + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func i32 @_Z16sub_group_rotateji(i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func i32 @_Z26sub_group_clustered_rotatejij(i32 noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateLong(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i64, align 8 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i64 0, ptr %v, align 8 + %value = load i64, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] 
%[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func i64 @_Z16sub_group_rotateli(i64 noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %data, i32 0 + store i64 %call, ptr addrspace(1) %arrayidx, align 8 + %value_clustered = load i64, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func i64 @_Z26sub_group_clustered_rotatelij(i64 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %data2, i32 1 + store i64 %call1, ptr addrspace(1) %arrayidx2, align 8 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func i64 @_Z16sub_group_rotateli(i64 noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func i64 @_Z26sub_group_clustered_rotatelij(i64 noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateULong(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i64, align 8 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i64 0, ptr %v, align 8 + %value = load i64, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func i64 @_Z16sub_group_rotatemi(i64 noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %data, i32 0 + store i64 %call, ptr addrspace(1) %arrayidx, align 8 + %value_clustered = load i64, ptr %v, align 8 + ; CHECK: 
OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func i64 @_Z26sub_group_clustered_rotatemij(i64 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %data2, i32 1 + store i64 %call1, ptr addrspace(1) %arrayidx2, align 8 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func i64 @_Z16sub_group_rotatemi(i64 noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func i64 @_Z26sub_group_clustered_rotatemij(i64 noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateFloat(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca float, align 4 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store float 0.000000e+00, ptr %v, align 4 + %value = load float, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyFloat]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func float @_Z16sub_group_rotatefi(float noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %data, i32 0 + store float %call, ptr addrspace(1) %arrayidx, align 4 + %value_clustered = load float, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyFloat]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func float @_Z26sub_group_clustered_rotatefij(float noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %data2, i32 1 + store 
float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func float @_Z16sub_group_rotatefi(float noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func float @_Z26sub_group_clustered_rotatefij(float noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateHalf(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca half, align 2 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store half 0xH0000, ptr %v, align 2 + %value = load half, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyHalf]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func half @_Z16sub_group_rotateDhi(half noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds half, ptr addrspace(1) %data, i32 0 + store half %call, ptr addrspace(1) %arrayidx, align 2 + %value_clustered = load half, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyHalf]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func half @_Z26sub_group_clustered_rotateDhij(half noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds half, ptr addrspace(1) %data2, i32 1 + store half %call1, ptr addrspace(1) %arrayidx2, align 2 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func half @_Z16sub_group_rotateDhi(half noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func half @_Z26sub_group_clustered_rotateDhij(half noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline 
norecurse nounwind optnone +define dso_local spir_kernel void @testRotateDouble(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca double, align 8 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store double 0.000000e+00, ptr %v, align 8 + %value = load double, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyDouble]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func double @_Z16sub_group_rotatedi(double noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds double, ptr addrspace(1) %data, i32 0 + store double %call, ptr addrspace(1) %arrayidx, align 8 + %value_clustered = load double, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyDouble]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func double @_Z26sub_group_clustered_rotatedij(double noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds double, ptr addrspace(1) %data2, i32 1 + store double %call1, ptr addrspace(1) %arrayidx2, align 8 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func double @_Z16sub_group_rotatedi(double noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func double @_Z26sub_group_clustered_rotatedij(double noundef, i32 noundef, i32 noundef) #1 + +attributes #0 = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } +attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #2 = { convergent nounwind } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} 
+!opencl.spir.version = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 2, i32 0} +!2 = !{!"clang version 19.0.0"} +!3 = !{i32 1} +!4 = !{!"none"} +!5 = !{!"char*"} +!6 = !{!""} +!7 = !{!"uchar*"} +!8 = !{!"short*"} +!9 = !{!"ushort*"} +!10 = !{!"int*"} +!11 = !{!"uint*"} +!12 = !{!"long*"} +!13 = !{!"ulong*"} +!14 = !{!"float*"} +!15 = !{!"half*"} +!16 = !{!"double*"} From 6cca23a3b91e12c0b6639449bc1e5eb564067db3 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Thu, 22 Feb 2024 10:30:00 +0100 Subject: [PATCH 196/351] [SPIRV] Prevent creation of jump tables from switch (#82287) This PR is to prevent creation of jump tables from switch. The reason is that SPIR-V doesn't know how to lower jump tables, and a sequence of commands that IRTranslator generates for switch via jump tables breaks SPIR-V Backend code generation with complaints about G_BRJT. The next example is the shortest code to break SPIR-V Backend code generation in this way: ``` target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" define spir_func void @foo(i32 noundef %val) { entry: switch i32 %val, label %sw.epilog [ i32 0, label %sw.bb i32 1, label %sw.bb2 i32 2, label %sw.bb3 i32 3, label %sw.bb4 ] sw.bb: br label %sw.epilog sw.bb2: br label %sw.epilog sw.bb3: br label %sw.epilog sw.bb4: br label %sw.epilog sw.epilog: ret void } ``` To resolve the issue we set a high lower limit for the number of blocks in a jump table via getMinimumJumpTableEntries() and prevent undesirable (or rather unsupported at the moment) path of code generation. 
--- llvm/lib/Target/SPIRV/SPIRVISelLowering.h | 3 ++ .../CodeGen/SPIRV/switch-no-jump-table.ll | 30 +++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h index f317b26207195..d34f802e9d889 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h @@ -31,6 +31,9 @@ class SPIRVTargetLowering : public TargetLowering { return true; } + // prevent creation of jump tables + bool areJTsAllowed(const Function *) const override { return false; } + // This is to prevent sexts of non-i64 vector indices which are generated // within general IRTranslator hence type generation for it is omitted. MVT getVectorIdxTy(const DataLayout &DL) const override { diff --git a/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll b/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll new file mode 100644 index 0000000000000..c9c0f17f0b91e --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll @@ -0,0 +1,30 @@ +; The test is to check that jump tables are not generated from switch + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpSwitch %[[#]] %[[Label:]] +; CHECK-4: OpBranch %[[Label]] + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_func void @foo(i32 noundef %val) { +entry: + switch i32 %val, label %sw.epilog [ + i32 0, label %sw.bb + i32 1, label %sw.bb2 + i32 2, label %sw.bb3 + i32 3, label %sw.bb4 + ] +sw.bb: + br label %sw.epilog +sw.bb2: + br label %sw.epilog +sw.bb3: + br label %sw.epilog +sw.bb4: + br label %sw.epilog +sw.epilog: + ret void +} From bcbffd99c48ed0cabd1b94e9ff252680f0968fc3 Mon Sep 17 00:00:00 
2001 From: Jay Foad Date: Thu, 22 Feb 2024 09:40:46 +0000 Subject: [PATCH 197/351] [AMDGPU] Split Dpp8FI and Dpp16FI operands (#82379) Split Dpp8FI and Dpp16FI into two different operands sharing an AsmOperandClass. They are parsed and rendered identically as fi:1 but the encoding is different: for DPP16 FI is a single bit, but for DPP8 it uses two different special values in the src0 field. Having a dedicated decoder for Dpp8FI allows it to reject other (non-special) src0 values so that AMDGPUDisassembler::getInstruction no longer needs to call isValidDPP8 to do post hoc validation of decoded DPP8 instructions. --- .../Disassembler/AMDGPUDisassembler.cpp | 33 ++++++++----------- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 19 ++++++----- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 +-- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 18 +++++----- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 ++--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 +-- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 2 +- 8 files changed, 43 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 894607dfdd8c4..53abb3e3f9aea 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -119,6 +119,12 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, return addOperand(Inst, DAsm->decodeSplitBarrier(Val)); } +static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr, + const MCDisassembler *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeDpp8FI(Val)); +} + #define DECODE_OPERAND(StaticDecoderName, DecoderName) \ static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \ uint64_t /*Addr*/, \ @@ -440,19 +446,6 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef &Bytes) { return 
DecoderUInt128(Lo, Hi); } -// The disassembler is greedy, so we need to check FI operand value to -// not parse a dpp if the correct literal is not set. For dpp16 the -// autogenerated decoder checks the dpp literal -static bool isValidDPP8(const MCInst &MI) { - using namespace llvm::AMDGPU::DPP; - int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi); - assert(FiIdx != -1); - if ((unsigned)FiIdx >= MI.getNumOperands()) - return false; - unsigned Fi = MI.getOperand(FiIdx).getImm(); - return Fi == DPP8_FI_0 || Fi == DPP8_FI_1; -} - DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes_, uint64_t Address, @@ -474,13 +467,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, MI, DecW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696, MI, DecW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear const auto convertVOPDPP = [&]() { if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) { @@ -530,26 +521,22 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; if (convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear } } Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP8GFX1164, DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP8GFX1264, DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); if (Res) break; @@ -982,7 +969,7 
@@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { AMDGPU::OpName::src1_modifiers); } } - return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail; + return MCDisassembler::Success; } DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { @@ -1831,6 +1818,12 @@ MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const { return decodeSrcOp(OPW32, Val); } +MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const { + if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1) + return MCOperand(); + return MCOperand::createImm(Val); +} + bool AMDGPUDisassembler::isVI() const { return STI.hasFeature(AMDGPU::FeatureVolcanicIslands); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3142b8a14a4dd..dd0581576bd22 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -261,6 +261,7 @@ class AMDGPUDisassembler : public MCDisassembler { MCOperand decodeBoolReg(unsigned Val) const; MCOperand decodeSplitBarrier(unsigned Val) const; + MCOperand decodeDpp8FI(unsigned Val) const; int getTTmpIdx(unsigned Val) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 97c723752b70b..34cdb09b0e15d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -987,8 +987,8 @@ def SDWAVopcDst : BoolRC { } class NamedIntOperand - : CustomOperand { + string name = NAME, string ConvertMethod = "nullptr"> + : CustomOperand { let ParserMethod = "[this](OperandVector &Operands) -> ParseStatus { "# "return parseIntWithPrefix(\""#Prefix#"\", Operands, "# @@ -1090,9 +1090,12 @@ let DefaultValue = "0xf" in { def DppRowMask : NamedIntOperand; def DppBankMask : NamedIntOperand; } -def DppBoundCtrl : NamedIntOperand bool { return convertDppBoundCtrl(BC); }">; -def DppFI : 
NamedIntOperand; + +let DecoderMethod = "decodeDpp8FI" in +def Dpp8FI : NamedIntOperand; +def Dpp16FI : NamedIntOperand; def blgp : CustomOperand; def CBSZ : NamedIntOperand; @@ -1823,7 +1826,7 @@ class getInsDPP16 { dag ret = !con(getInsDPP.ret, - (ins DppFI:$fi)); + (ins Dpp16FI:$fi)); } class getInsDPP8 { dag ret = !con(getInsDPPBase.ret, - (ins dpp8:$dpp8, DppFI:$fi)); + (ins dpp8:$dpp8, Dpp8FI:$fi)); } class getInsVOP3DPPBase { @@ -1851,12 +1854,12 @@ class getInsVOP3DPP { dag ret = !con(getInsVOP3DPP.ret, - (ins DppFI:$fi)); + (ins Dpp16FI:$fi)); } class getInsVOP3DPP8 { dag ret = !con(getInsVOP3DPPBase.ret, - (ins dpp8:$dpp8, DppFI:$fi)); + (ins dpp8:$dpp8, Dpp8FI:$fi)); } // Ins for SDWA diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 99f8e8ede4ace..576ad32a70cf3 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -380,9 +380,9 @@ class VOP_MOVREL : VOPProfile<[untyped, i32, untyped, un let OutsDPP = (outs Src0RC32:$vdst); let InsDPP16 = (ins Src0RC32:$old, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret; - let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, DppFI:$fi); + let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, Dpp8FI:$fi); let AsmDPP8 = getAsmDPP8<1, 1, 0>.ret; let OutsVOP3DPP = (outs Src0RC64:$vdst); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 4437d5f2a0333..9f54e69f6d55e 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -430,7 +430,7 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v getVregSrcForVT.ret:$src2, // stub argument dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, 
DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsVOP3Base = getInsVOP3Base, 3, 0, HasModifiers, HasModifiers, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret; @@ -447,7 +447,7 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT.ret:$src2, // stub argument - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, getVregSrcForVT.ret:$src2, // stub argument @@ -500,7 +500,7 @@ def VOP_MAC_F16_t16 : VOP_MAC { let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT.ret:$src2, // stub argument - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Src2Mod = FP32InputMods; // dummy unused modifiers let Src2RC64 = VGPRSrc_32; // stub argument } @@ -552,11 +552,11 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], /*EnableClamp=*/ Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, Src0DPP:$src0, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let OutsVOP3DPP = Outs64; @@ -594,11 +594,11 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableClamp=*/1> Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, Src0DPP:$src0, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); 
let HasExt = 1; let HasExtDPP = 1; @@ -645,11 +645,11 @@ class VOP2e_SGPR ArgVT> : VOPProfile { FPVRegInputMods:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, FPVRegInputMods:$src0_modifiers, Src0DPP:$src0, FPVRegInputMods:$src1_modifiers, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Src0ModVOP3DPP = FPVRegInputMods; let Src1ModVOP3DPP = FPVRegInputMods; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 396ae9c9d92ee..7198a4022dae8 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -532,11 +532,11 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile { FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let InsVOP3DPP8 = (ins VGPR_32:$old, FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, - VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi); + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi); let HasClamp = 0; let HasExtVOP3DPP = 1; @@ -553,12 +553,12 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let InsVOP3DPP8 = (ins VGPR_32:$old, FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, 
- op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi); + op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 74f451b6d4f7f..a0090f3e8d1db 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -461,13 +461,13 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile, let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, - neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, DppFI:$fi); + neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, Dpp8FI:$fi); let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, - DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); } multiclass VOP3PDOTF8Inst { diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index fe52a0e39e4f1..508f06c4739a5 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -766,7 +766,7 @@ class VOPC_Class_Profile sched, ValueType src0VT, ValueType let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP16 = AsmDPP#"$fi"; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); // DPP8 forbids modifiers and can inherit from VOPC_Profile let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); From 6193233540e55de61baeb80208b06c6808b14dbc Mon Sep 17 00:00:00 2001 From: Yury Gribov Date: Thu, 22 Feb 2024 13:01:37 +0300 Subject: 
[PATCH 198/351] [AArch64] Fix sched model for TSV110 core. (#82343) Accumulator operand of MADD instruction can be bypassed from another MUL-like operation. Before this fix bypassing was incorrectly applied to multiplier operand. Co-authored-by: Yury Gribov --- llvm/lib/Target/AArch64/AArch64SchedTSV110.td | 6 +- .../AArch64/HiSilicon/tsv110-forwarding.s | 83 +++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 0ae9a69fd4826..1c577a25bf739 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -419,10 +419,10 @@ def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>; def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>; def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>; -def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>; -def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; -def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; +def : InstRW<[TSV110Wr_4cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>; diff --git a/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s new file mode 100644 index 0000000000000..207822b618396 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s @@ -0,0 +1,83 @@ +# NOTE: Assertions have been 
autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=tsv110 --instruction-info=0 --resource-pressure=0 --timeline --iterations=1 < %s | FileCheck %s + +# LLVM-MCA-BEGIN madd nobypass +mul x0, x1, x2 +add x0, x0, x1 +add x0, x0, x1 +add x0, x0, x1 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN madd bypass +mul x0, x1, x2 +madd x0, x1, x2, x0 +madd x0, x1, x2, x0 +madd x0, x0, x0, x0 +# LLVM-MCA-END + +# CHECK: [0] Code Region - madd nobypass + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 4 +# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total uOps: 4 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.40 +# CHECK-NEXT: IPC: 0.40 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeeER . mul x0, x1, x2 +# CHECK-NEXT: [0,1] D====eER . add x0, x0, x1 +# CHECK-NEXT: [0,2] D=====eER. add x0, x0, x1 +# CHECK-NEXT: [0,3] D======eER add x0, x0, x1 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 mul x0, x1, x2 +# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x0, x1 +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 add x0, x0, x1 +# CHECK-NEXT: 3. 1 7.0 0.0 0.0 add x0, x0, x1 +# CHECK-NEXT: 1 4.8 0.3 0.0 + +# CHECK: [1] Code Region - madd bypass + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 4 +# CHECK-NEXT: Total Cycles: 13 +# CHECK-NEXT: Total uOps: 4 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeeER . . mul x0, x1, x2 +# CHECK-NEXT: [0,1] D=eeeeER . . 
madd x0, x1, x2, x0 +# CHECK-NEXT: [0,2] D==eeeeER . . madd x0, x1, x2, x0 +# CHECK-NEXT: [0,3] D======eeeeER madd x0, x0, x0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 mul x0, x1, x2 +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 2. 1 3.0 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 3. 1 7.0 0.0 0.0 madd x0, x0, x0, x0 +# CHECK-NEXT: 1 3.3 0.3 0.0 From 4a602d9250e1eb3c729d0421d11be2be8693cbf2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:05:19 +0100 Subject: [PATCH 199/351] Add support for the SPV_INTEL_usm_storage_classes extension (#82247) Add support for the SPV_INTEL_usm_storage_classes extension: * https://github.com/intel/llvm/blob/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_usm_storage_classes.asciidoc --- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 17 ++-- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 5 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 4 + .../Target/SPIRV/SPIRVInstructionSelector.cpp | 36 ++++++-- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 16 ++-- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 7 ++ llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 11 ++- llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 6 ++ .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 3 + llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 19 ++++- llvm/lib/Target/SPIRV/SPIRVUtils.h | 3 +- .../intel-usm-addrspaces.ll | 84 +++++++++++++++++++ 12 files changed, 183 insertions(+), 28 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll diff --git 
a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index cc438b2bb8d4d..10569ef0468bd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -150,7 +150,8 @@ getKernelArgTypeQual(const Function &F, unsigned ArgIdx) { static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, SPIRVGlobalRegistry *GR, - MachineIRBuilder &MIRBuilder) { + MachineIRBuilder &MIRBuilder, + const SPIRVSubtarget &ST) { // Read argument's access qualifier from metadata or default. SPIRV::AccessQualifier::AccessQualifier ArgAccessQual = getArgAccessQual(F, ArgIdx); @@ -169,8 +170,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, if (MDTypeStr.ends_with("*")) ResArgType = GR->getOrCreateSPIRVTypeByName( MDTypeStr, MIRBuilder, - addressSpaceToStorageClass( - OriginalArgType->getPointerAddressSpace())); + addressSpaceToStorageClass(OriginalArgType->getPointerAddressSpace(), + ST)); else if (MDTypeStr.ends_with("_t")) ResArgType = GR->getOrCreateSPIRVTypeByName( "opencl." + MDTypeStr.str(), MIRBuilder, @@ -206,6 +207,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, assert(GR && "Must initialize the SPIRV type registry before lowering args."); GR->setCurrentFunc(MIRBuilder.getMF()); + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast(&MIRBuilder.getMF().getSubtarget()); + // Assign types and names to all args, and store their types for later. FunctionType *FTy = getOriginalFunctionType(F); SmallVector ArgTypeVRegs; @@ -216,7 +221,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // TODO: handle the case of multiple registers. 
if (VRegs[i].size() > 1) return false; - auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder); + auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder, *ST); GR->assignSPIRVTypeToVReg(SpirvTy, VRegs[i][0], MIRBuilder.getMF()); ArgTypeVRegs.push_back(SpirvTy); @@ -318,10 +323,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (F.hasName()) buildOpName(FuncVReg, F.getName(), MIRBuilder); - // Get access to information about available extensions - const auto *ST = - static_cast(&MIRBuilder.getMF().getSubtarget()); - // Handle entry points and function linkage. if (isEntryPoint(F)) { const auto &STI = MIRBuilder.getMF().getSubtarget(); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 47fec745c3f18..a1cb630f1aa47 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -709,7 +709,10 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType( // TODO: change the implementation once opaque pointers are supported // in the SPIR-V specification. SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder); - auto SC = addressSpaceToStorageClass(PType->getAddressSpace()); + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast(&MIRBuilder.getMF().getSubtarget()); + auto SC = addressSpaceToStorageClass(PType->getAddressSpace(), *ST); // Null pointer means we have a loop in type definitions, make and // return corresponding OpTypeForwardPointer. 
if (SpvElementType == nullptr) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 86f65b6320d53..7c5252e8cb372 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -430,6 +430,10 @@ def OpGenericCastToPtrExplicit : Op<123, (outs ID:$r), (ins TYPE:$t, ID:$p, Stor "$r = OpGenericCastToPtrExplicit $t $p $s">; def OpBitcast : UnOp<"OpBitcast", 124>; +// SPV_INTEL_usm_storage_classes +def OpPtrCastToCrossWorkgroupINTEL : UnOp<"OpPtrCastToCrossWorkgroupINTEL", 5934>; +def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938>; + // 3.42.12 Composite Instructions def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx), diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 53d19a1e31382..7258d3b4d88ed 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -828,8 +828,18 @@ static bool isGenericCastablePtr(SPIRV::StorageClass::StorageClass SC) { } } +static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) { + switch (SC) { + case SPIRV::StorageClass::DeviceOnlyINTEL: + case SPIRV::StorageClass::HostOnlyINTEL: + return true; + default: + return false; + } +} + // In SPIR-V address space casting can only happen to and from the Generic -// storage class. We can also only case Workgroup, CrossWorkgroup, or Function +// storage class. We can also only cast Workgroup, CrossWorkgroup, or Function // pointers to and from Generic pointers. As such, we can convert e.g. from // Workgroup to Function by going via a Generic pointer as an intermediary. All // other combinations can only be done by a bitcast, and are probably not safe. 
@@ -862,13 +872,17 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, SPIRV::StorageClass::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr); SPIRV::StorageClass::StorageClass DstSC = GR.getPointerStorageClass(ResVReg); - // Casting from an eligable pointer to Generic. + // don't generate a cast between identical storage classes + if (SrcSC == DstSC) + return true; + + // Casting from an eligible pointer to Generic. if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric); - // Casting from Generic to an eligable pointer. + // Casting from Generic to an eligible pointer. if (SrcSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(DstSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpGenericCastToPtr); - // Casting between 2 eligable pointers using Generic as an intermediary. + // Casting between 2 eligible pointers using Generic as an intermediary. if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) { Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass); SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType( @@ -886,6 +900,16 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, .addUse(Tmp) .constrainAllUses(TII, TRI, RBI); } + + // Check if instructions from the SPV_INTEL_usm_storage_classes extension may + // be applied + if (isUSMStorageClass(SrcSC) && DstSC == SPIRV::StorageClass::CrossWorkgroup) + return selectUnOp(ResVReg, ResType, I, + SPIRV::OpPtrCastToCrossWorkgroupINTEL); + if (SrcSC == SPIRV::StorageClass::CrossWorkgroup && isUSMStorageClass(DstSC)) + return selectUnOp(ResVReg, ResType, I, + SPIRV::OpCrossWorkgroupCastToPtrINTEL); + // TODO Should this case just be disallowed completely? // We're casting 2 other arbitrary address spaces, so have to bitcast. 
return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast); @@ -1545,7 +1569,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( } SPIRVType *ResType = GR.getOrCreateSPIRVPointerType( PointerBaseType, I, TII, - addressSpaceToStorageClass(GV->getAddressSpace())); + addressSpaceToStorageClass(GV->getAddressSpace(), STI)); std::string GlobalIdent; if (!GV->hasName()) { @@ -1618,7 +1642,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( unsigned AddrSpace = GV->getAddressSpace(); SPIRV::StorageClass::StorageClass Storage = - addressSpaceToStorageClass(AddrSpace); + addressSpaceToStorageClass(AddrSpace, STI); bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage && Storage != SPIRV::StorageClass::Function; SPIRV::LinkageType::LinkageType LnkType = diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 011a550a7b3d9..4f2e7a240fc2c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -102,14 +102,16 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { const LLT p2 = LLT::pointer(2, PSize); // UniformConstant const LLT p3 = LLT::pointer(3, PSize); // Workgroup const LLT p4 = LLT::pointer(4, PSize); // Generic - const LLT p5 = LLT::pointer(5, PSize); // Input + const LLT p5 = + LLT::pointer(5, PSize); // Input, SPV_INTEL_usm_storage_classes (Device) + const LLT p6 = LLT::pointer(6, PSize); // SPV_INTEL_usm_storage_classes (Host) // TODO: remove copy-pasting here by using concatenation in some way. 
auto allPtrsScalarsAndVectors = { - p0, p1, p2, p3, p4, p5, s1, s8, s16, - s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, - v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, - v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; + p0, p1, p2, p3, p4, p5, p6, s1, s8, s16, + s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, v3s16, + v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16, + v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; auto allScalarsAndVectors = { s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, @@ -133,8 +135,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { auto allFloatAndIntScalars = allIntScalars; - auto allPtrs = {p0, p1, p2, p3, p4, p5}; - auto allWritablePtrs = {p0, p1, p3, p4}; + auto allPtrs = {p0, p1, p2, p3, p4, p5, p6}; + auto allWritablePtrs = {p0, p1, p3, p4, p5, p6}; for (auto Opc : TypeFoldingSupportingOpcs) getActionDefinitionsBuilder(Opc).custom(); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 9b9575b987994..3be28c97d9538 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1063,6 +1063,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::ExpectAssumeKHR); } break; + case SPIRV::OpPtrCastToCrossWorkgroupINTEL: + case SPIRV::OpCrossWorkgroupCastToPtrINTEL: + if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes); + Reqs.addCapability(SPIRV::Capability::USMStorageClassesINTEL); + } + break; case SPIRV::OpConstantFunctionPointerINTEL: if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) { Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 
cbc16fa986614..144216896eb68 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -122,6 +122,9 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) { static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast(&MIB.getMF().getSubtarget()); SmallVector ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -141,7 +144,7 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB); SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( BaseTy, MI, *MF.getSubtarget().getInstrInfo(), - addressSpaceToStorageClass(MI.getOperand(4).getImm())); + addressSpaceToStorageClass(MI.getOperand(4).getImm(), *ST)); // If the bitcast would be redundant, replace all uses with the source // register. 
@@ -250,6 +253,10 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast(&MIB.getMF().getSubtarget()); + MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector ToErase; @@ -269,7 +276,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, getMDOperandAsType(MI.getOperand(2).getMetadata(), 0), MIB); SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( BaseTy, MI, *MF.getSubtarget().getInstrInfo(), - addressSpaceToStorageClass(MI.getOperand(3).getImm())); + addressSpaceToStorageClass(MI.getOperand(3).getImm(), *ST)); MachineInstr *Def = MRI.getVRegDef(Reg); assert(Def && "Expecting an instruction that defines the register"); insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB, diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index 4694363614ef6..79f16146ccd94 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -49,6 +49,12 @@ cl::list Extensions( clEnumValN(SPIRV::Extension::SPV_INTEL_optnone, "SPV_INTEL_optnone", "Adds OptNoneINTEL value for Function Control mask that " "indicates a request to not optimize the function."), + clEnumValN(SPIRV::Extension::SPV_INTEL_usm_storage_classes, + "SPV_INTEL_usm_storage_classes", + "Introduces two new storage classes that are sub classes of " + "the CrossWorkgroup storage class " + "that provides additional information that can enable " + "optimization."), clEnumValN(SPIRV::Extension::SPV_INTEL_subgroups, "SPV_INTEL_subgroups", "Allows work items in a subgroup to share data without the " "use of local memory and work group barriers, and to " diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 
6c36087baa85e..b022b97408d7d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -463,6 +463,7 @@ defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atom defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; +defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time @@ -700,6 +701,8 @@ defm IncomingRayPayloadNV : StorageClassOperand<5342, [RayTracingNV]>; defm ShaderRecordBufferNV : StorageClassOperand<5343, [RayTracingNV]>; defm PhysicalStorageBufferEXT : StorageClassOperand<5349, [PhysicalStorageBufferAddressesEXT]>; defm CodeSectionINTEL : StorageClassOperand<5605, [FunctionPointersINTEL]>; +defm DeviceOnlyINTEL : StorageClassOperand<5936, [USMStorageClassesINTEL]>; +defm HostOnlyINTEL : StorageClassOperand<5937, [USMStorageClassesINTEL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Dim enum values and at the same time diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 05f766d3ec548..169d7cc93897e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "SPIRV.h" #include "SPIRVInstrInfo.h" +#include "SPIRVSubtarget.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -146,15 +147,19 @@ unsigned 
storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) { return 3; case SPIRV::StorageClass::Generic: return 4; + case SPIRV::StorageClass::DeviceOnlyINTEL: + return 5; + case SPIRV::StorageClass::HostOnlyINTEL: + return 6; case SPIRV::StorageClass::Input: return 7; default: - llvm_unreachable("Unable to get address space id"); + report_fatal_error("Unable to get address space id"); } } SPIRV::StorageClass::StorageClass -addressSpaceToStorageClass(unsigned AddrSpace) { +addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) { switch (AddrSpace) { case 0: return SPIRV::StorageClass::Function; @@ -166,10 +171,18 @@ addressSpaceToStorageClass(unsigned AddrSpace) { return SPIRV::StorageClass::Workgroup; case 4: return SPIRV::StorageClass::Generic; + case 5: + return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes) + ? SPIRV::StorageClass::DeviceOnlyINTEL + : SPIRV::StorageClass::CrossWorkgroup; + case 6: + return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes) + ? SPIRV::StorageClass::HostOnlyINTEL + : SPIRV::StorageClass::CrossWorkgroup; case 7: return SPIRV::StorageClass::Input; default: - llvm_unreachable("Unknown address space"); + report_fatal_error("Unknown address space"); } } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index a33dc02f854f5..1af53dcd0c4cd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -27,6 +27,7 @@ class MachineRegisterInfo; class Register; class StringRef; class SPIRVInstrInfo; +class SPIRVSubtarget; // Add the given string as a series of integer operand, inserting null // terminators and padding to make sure the operands all have 32-bit @@ -62,7 +63,7 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC); // Convert an LLVM IR address space to a SPIR-V storage class. 
SPIRV::StorageClass::StorageClass -addressSpaceToStorageClass(unsigned AddrSpace); +addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI); SPIRV::MemorySemantics::MemorySemantics getMemSemanticsForStorageClass(SPIRV::StorageClass::StorageClass SC); diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll new file mode 100644 index 0000000000000..30c16350bf2b1 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll @@ -0,0 +1,84 @@ +; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_usm_storage_classes/intel_usm_addrspaces.ll + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_usm_storage_classes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-EXT +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-extensions=SPV_INTEL_usm_storage_classes %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-WITHOUT +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-: Capability USMStorageClassesINTEL +; CHECK-SPIRV-WITHOUT-NO: Capability USMStorageClassesINTEL +; CHECK-SPIRV-EXT-DAG: %[[DevTy:[0-9]+]] = OpTypePointer DeviceOnlyINTEL %[[#]] +; CHECK-SPIRV-EXT-DAG: %[[HostTy:[0-9]+]] = OpTypePointer HostOnlyINTEL %[[#]] +; CHECK-SPIRV-DAG: %[[CrsWrkTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[#]] + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @foo_kernel() { +entry: + ret void +} + +; CHECK-SPIRV: %[[Ptr1:[0-9]+]] = OpLoad %[[CrsWrkTy]] %[[#]] +; 
CHECK-SPIRV-EXT: %[[CastedPtr1:[0-9]+]] = OpCrossWorkgroupCastToPtrINTEL %[[DevTy]] %[[Ptr1]] +; CHECK-SPIRV-WITHOUT-NOT: OpCrossWorkgroupCastToPtrINTEL +; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr1]] +define spir_func void @test1(ptr addrspace(1) %arg_glob, ptr addrspace(5) %arg_dev) { +entry: + %arg_glob.addr = alloca ptr addrspace(1), align 4 + %arg_dev.addr = alloca ptr addrspace(5), align 4 + store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4 + store ptr addrspace(5) %arg_dev, ptr %arg_dev.addr, align 4 + %loaded_glob = load ptr addrspace(1), ptr %arg_glob.addr, align 4 + %casted_ptr = addrspacecast ptr addrspace(1) %loaded_glob to ptr addrspace(5) + store ptr addrspace(5) %casted_ptr, ptr %arg_dev.addr, align 4 + ret void +} + +; CHECK-SPIRV: %[[Ptr2:[0-9]+]] = OpLoad %[[CrsWrkTy]] %[[#]] +; CHECK-SPIRV-EXT: %[[CastedPtr2:[0-9]+]] = OpCrossWorkgroupCastToPtrINTEL %[[HostTy]] %[[Ptr2]] +; CHECK-SPIRV-WITHOUT-NOT: OpCrossWorkgroupCastToPtrINTEL +; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr2]] +define spir_func void @test2(ptr addrspace(1) %arg_glob, ptr addrspace(6) %arg_host) { +entry: + %arg_glob.addr = alloca ptr addrspace(1), align 4 + %arg_host.addr = alloca ptr addrspace(6), align 4 + store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4 + store ptr addrspace(6) %arg_host, ptr %arg_host.addr, align 4 + %loaded_glob = load ptr addrspace(1), ptr %arg_glob.addr, align 4 + %casted_ptr = addrspacecast ptr addrspace(1) %loaded_glob to ptr addrspace(6) + store ptr addrspace(6) %casted_ptr, ptr %arg_host.addr, align 4 + ret void +} + +; CHECK-SPIRV-EXT: %[[Ptr3:[0-9]+]] = OpLoad %[[DevTy]] %[[#]] +; CHECK-SPIRV-EXT: %[[CastedPtr3:[0-9]+]] = OpPtrCastToCrossWorkgroupINTEL %[[CrsWrkTy]] %[[Ptr3]] +; CHECK-SPIRV-WITHOUT-NOT: OpPtrCastToCrossWorkgroupINTEL +; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr3]] +define spir_func void @test3(ptr addrspace(1) %arg_glob, ptr addrspace(5) %arg_dev) { +entry: + %arg_glob.addr = alloca ptr addrspace(1), 
align 4 + %arg_dev.addr = alloca ptr addrspace(5), align 4 + store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4 + store ptr addrspace(5) %arg_dev, ptr %arg_dev.addr, align 4 + %loaded_dev = load ptr addrspace(5), ptr %arg_dev.addr, align 4 + %casted_ptr = addrspacecast ptr addrspace(5) %loaded_dev to ptr addrspace(1) + store ptr addrspace(1) %casted_ptr, ptr %arg_glob.addr, align 4 + ret void +} + +; CHECK-SPIRV-EXT: %[[Ptr4:[0-9]+]] = OpLoad %[[HostTy]] %[[#]] +; CHECK-SPIRV-EXT: %[[CastedPtr4:[0-9]+]] = OpPtrCastToCrossWorkgroupINTEL %[[CrsWrkTy]] %[[Ptr4]] +; CHECK-SPIRV-WITHOUT-NOT: OpPtrCastToCrossWorkgroupINTEL +; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr4]] +define spir_func void @test4(ptr addrspace(1) %arg_glob, ptr addrspace(6) %arg_host) { +entry: + %arg_glob.addr = alloca ptr addrspace(1), align 4 + %arg_host.addr = alloca ptr addrspace(6), align 4 + store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4 + store ptr addrspace(6) %arg_host, ptr %arg_host.addr, align 4 + %loaded_host = load ptr addrspace(6), ptr %arg_host.addr, align 4 + %casted_ptr = addrspacecast ptr addrspace(6) %loaded_host to ptr addrspace(1) + store ptr addrspace(1) %casted_ptr, ptr %arg_glob.addr, align 4 + ret void +} From f01719afaae9a208ac272d99760d18e4c16d9241 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 22 Feb 2024 10:21:12 +0000 Subject: [PATCH 200/351] [mlir][test] Add integration tests for vector.interleave (#80969) --- .../CPU/ArmSVE/test-scalable-interleave.mlir | 24 +++++++++++++++++++ .../Dialect/Vector/CPU/test-interleave.mlir | 24 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/test-interleave.mlir diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir new 
file mode 100644 index 0000000000000..8ae3eee6462ca --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir @@ -0,0 +1,24 @@ +// RUN: mlir-opt %s -test-lower-to-llvm | \ +// RUN: %mcr_aarch64_cmd -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_c_runner_utils,%mlir_arm_runner_utils | \ +// RUN: FileCheck %s + +func.func @entry() { + %f1 = arith.constant 1.0 : f32 + %f2 = arith.constant 2.0 : f32 + %v1 = vector.splat %f1 : vector<[4]xf32> + %v2 = vector.splat %f2 : vector<[4]xf32> + vector.print %v1 : vector<[4]xf32> + vector.print %v2 : vector<[4]xf32> + // + // Test vectors: + // + // CHECK: ( 1, 1, 1, 1 + // CHECK: ( 2, 2, 2, 2 + + %v3 = vector.interleave %v1, %v2 : vector<[4]xf32> + vector.print %v3 : vector<[8]xf32> + // CHECK: ( 1, 2, 1, 2, 1, 2, 1, 2 + + return +} diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-interleave.mlir new file mode 100644 index 0000000000000..0bc78af6aba03 --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-interleave.mlir @@ -0,0 +1,24 @@ +// RUN: mlir-opt %s -test-lower-to-llvm | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_c_runner_utils | \ +// RUN: FileCheck %s + +func.func @entry() { + %f1 = arith.constant 1.0 : f32 + %f2 = arith.constant 2.0 : f32 + %v1 = vector.splat %f1 : vector<2x4xf32> + %v2 = vector.splat %f2 : vector<2x4xf32> + vector.print %v1 : vector<2x4xf32> + vector.print %v2 : vector<2x4xf32> + // + // Test vectors: + // + // CHECK: ( ( 1, 1, 1, 1 ), ( 1, 1, 1, 1 ) ) + // CHECK: ( ( 2, 2, 2, 2 ), ( 2, 2, 2, 2 ) ) + + %v3 = vector.interleave %v1, %v2 : vector<2x4xf32> + vector.print %v3 : vector<2x8xf32> + // CHECK: ( ( 1, 2, 1, 2, 1, 2, 1, 2 ), ( 1, 2, 1, 2, 1, 2, 1, 2 ) ) + + return +} From e4d4ebe0415b9f1fd8cb034ac68f0616f12facf2 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 22 Feb 2024 10:22:07 +0000 Subject: 
[PATCH 201/351] [llvm][llvm-jitlink] Disable test on Windows on Arm This fails on one of our bots: https://lab.llvm.org/buildbot/#/builders/120/builds/6309 llvm-jitlink error: Unsupported target machine architecture in COFF object The other bot doesn't run the test at all it seems but I can't explain why. It's also possible that I'm mistaken and the mostly native but still "cross compiling" setup we have on WoA means an x86 object is produced sometimes (perhaps because a default triple is still x86). --- llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test index 33ad5515a6357..ec71011d545eb 100644 --- a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test +++ b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test @@ -5,4 +5,7 @@ # # Use -sectcreate to create a section from a data file. +# Jitlink does not support ARM64 COFF files. +# UNSUPPORTED: target=aarch64-pc-windows-{{.*}} + # jitlink-check: *{4}foo = 0x2a2a5a5a \ No newline at end of file From b9ce237980b5a636e87e3578609c812833f7537f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 22 Feb 2024 10:39:43 +0000 Subject: [PATCH 202/351] [AMDGPU] Clean up conversion of DPP instructions in AMDGPUDisassembler (#82480) Convert DPP instructions after all calls to tryDecodeInst, just like we do for all other instruction types. NFCI. 
--- .../Disassembler/AMDGPUDisassembler.cpp | 127 ++++++++---------- 1 file changed, 53 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 53abb3e3f9aea..c5d06dea92c30 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -465,36 +465,25 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696, MI, DecW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; + Res = tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696, MI, DecW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; - const auto convertVOPDPP = [&]() { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) { - convertVOP3PDPPInst(MI); - } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) { - convertVOPCDPPInst(MI); // Special VOP3 case - } else { - assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3); - convertVOP3DPPInst(MI); // Regular VOP3 case - } - }; Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696, MI, DecW, Address, CS); - if (Res) { - convertVOPDPP(); + if (Res) break; - } + Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696, MI, DecW, Address, CS); - if (Res) { - convertVOPDPP(); + if (Res) break; - } + Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS); if (Res) break; @@ -515,27 +504,22 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) { Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS); - if (Res) { - if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) - == -1) - break; - if (convertDPP8Inst(MI) == 
MCDisassembler::Success) - break; - } + if (Res) + break; } Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; Res = tryDecodeInst(DecoderTableDPP8GFX1164, DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; Res = tryDecodeInst(DecoderTableDPP8GFX1264, DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); @@ -543,19 +527,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664, MI, QW, Address, CS); - if (Res) { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) - convertVOPCDPPInst(MI); + if (Res) break; - } Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664, MI, QW, Address, CS); - if (Res) { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) - convertVOPCDPPInst(MI); + if (Res) break; - } if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS); @@ -652,6 +630,22 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Address, CS); } while (false); + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP)) { + if (isMacDPP(MI)) + convertMacDPPInst(MI); + + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) + convertVOP3PDPPInst(MI); + else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) || + AMDGPU::isVOPC64DPP(MI.getOpcode())) + convertVOPCDPPInst(MI); // Special VOP3 case + else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) != + -1) + convertDPP8Inst(MI); + else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) + convertVOP3DPPInst(MI); // Regular VOP3 case + 
} + if (Res && AMDGPU::isMAC(MI.getOpcode())) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), @@ -926,56 +920,41 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { AMDGPU::OpName::src2_modifiers); } -// We must check FI == literal to reject not genuine dpp8 insts, and we must -// first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); - if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { - convertVOP3PDPPInst(MI); - } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || - AMDGPU::isVOPC64DPP(Opc)) { - convertVOPCDPPInst(MI); - } else { - if (isMacDPP(MI)) - convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); - int VDstInIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); - if (VDstInIdx != -1) - insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); - if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || - MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) - insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { + convertTrue16OpSel(MI); + auto Mods = collectVOPModifiers(MI); + insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), + AMDGPU::OpName::op_sel); + } else { + // Insert dummy unused src modifiers. 
+ if (MI.getNumOperands() < DescNumOps && + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers)) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); - unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { - convertTrue16OpSel(MI); - auto Mods = collectVOPModifiers(MI); - insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), - AMDGPU::OpName::op_sel); - } else { - // Insert dummy unused src modifiers. - if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers)) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src0_modifiers); - - if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers)) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src1_modifiers); - } + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers)) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); } return MCDisassembler::Success; } DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { - if (isMacDPP(MI)) - convertMacDPPInst(MI); - convertTrue16OpSel(MI); int VDstInIdx = From 4f12f47550eee85447c9ec37d27a20c6593d3d40 Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Thu, 22 Feb 2024 10:45:27 +0000 Subject: [PATCH 203/351] [AArch64] Switch to soft promoting half types. (#80576) The traditional promotion is known to generate wrong code. Like #80440 for ARM, except that far less is affected as on AArch64, hardware floating point support always includes FP16 support and is unaffected by these changes. This only affects `-mgeneral-regs-only` (Clang) / `-mattr=-fp-armv8` (LLVM). 
Because this only affects a configuration where no FP support is available at all, `useFPRegsForHalfType()` has no effect and is not specified: `f32` was getting legalized as a parameter and return type to an integer anyway. --- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 + .../AArch64/strictfp_f16_abi_promote.ll | 140 +++--------------- 2 files changed, 26 insertions(+), 116 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 436b21fd13463..bec13484450d7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1308,6 +1308,8 @@ class AArch64TargetLowering : public TargetLowering { bool preferScalarizeSplat(SDNode *N) const override; unsigned getMinimumJumpTableEntries() const override; + + bool softPromoteHalfType() const override { return true; } }; namespace AArch64 { diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll index 37186cf22ccc7..a34f7abcc22a3 100644 --- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -70,22 +70,20 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: .cfi_offset w22, -32 ; NOFP16-NEXT: .cfi_offset w30, -48 ; NOFP16-NEXT: mov w21, w0 -; NOFP16-NEXT: and w0, w2, #0xffff +; NOFP16-NEXT: and w0, w1, #0xffff ; NOFP16-NEXT: mov x19, x3 -; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: mov w20, w2 ; NOFP16-NEXT: bl __gnu_h2f_ieee ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: and w0, w21, #0xffff ; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: mov w8, w0 ; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: orr x21, x8, x22, lsl #32 ; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w8, w21 -; NOFP16-NEXT: // kill: def $w0 killed $w0 def $x0 -; NOFP16-NEXT: str w22, [x19, #8] -; NOFP16-NEXT: orr x8, x8, x0, lsl #32 +; NOFP16-NEXT: 
str x21, [x19] ; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; NOFP16-NEXT: str x8, [x19] +; NOFP16-NEXT: str w0, [x19, #8] ; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; NOFP16-NEXT: ret @@ -182,46 +180,17 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { define void @outgoing_v4f16_return(ptr %ptr) #0 { ; NOFP16-LABEL: outgoing_v4f16_return: ; NOFP16: // %bb.0: -; NOFP16-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill -; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 ; NOFP16-NEXT: .cfi_offset w19, -8 -; NOFP16-NEXT: .cfi_offset w20, -16 -; NOFP16-NEXT: .cfi_offset w21, -24 -; NOFP16-NEXT: .cfi_offset w22, -32 -; NOFP16-NEXT: .cfi_offset w23, -40 -; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: .cfi_offset w30, -16 ; NOFP16-NEXT: mov x19, x0 ; NOFP16-NEXT: bl v4f16_result -; NOFP16-NEXT: and w0, w0, #0xffff -; NOFP16-NEXT: mov w20, w1 -; NOFP16-NEXT: mov w21, w2 -; NOFP16-NEXT: mov w22, w3 -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w23, w0 -; NOFP16-NEXT: and w0, w20, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w20, w0 -; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w21, w0 -; NOFP16-NEXT: and w0, w22, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #6] -; NOFP16-NEXT: mov w0, w21 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #4] -; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #2] -; NOFP16-NEXT: mov w0, w23 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w2, [x19, #4] +; NOFP16-NEXT: strh w3, [x19, #6] +; NOFP16-NEXT: strh w1, [x19, 
#2] ; NOFP16-NEXT: strh w0, [x19] -; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload ; NOFP16-NEXT: ret %val = call <4 x half> @v4f16_result() store <4 x half> %val, ptr %ptr @@ -231,82 +200,21 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { define void @outgoing_v8f16_return(ptr %ptr) #0 { ; NOFP16-LABEL: outgoing_v8f16_return: ; NOFP16: // %bb.0: -; NOFP16-NEXT: stp x30, x27, [sp, #-80]! // 16-byte Folded Spill -; NOFP16-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NOFP16-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NOFP16-NEXT: .cfi_def_cfa_offset 80 +; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 ; NOFP16-NEXT: .cfi_offset w19, -8 -; NOFP16-NEXT: .cfi_offset w20, -16 -; NOFP16-NEXT: .cfi_offset w21, -24 -; NOFP16-NEXT: .cfi_offset w22, -32 -; NOFP16-NEXT: .cfi_offset w23, -40 -; NOFP16-NEXT: .cfi_offset w24, -48 -; NOFP16-NEXT: .cfi_offset w25, -56 -; NOFP16-NEXT: .cfi_offset w26, -64 -; NOFP16-NEXT: .cfi_offset w27, -72 -; NOFP16-NEXT: .cfi_offset w30, -80 +; NOFP16-NEXT: .cfi_offset w30, -16 ; NOFP16-NEXT: mov x19, x0 ; NOFP16-NEXT: bl v8f16_result -; NOFP16-NEXT: and w0, w0, #0xffff -; NOFP16-NEXT: mov w21, w1 -; NOFP16-NEXT: mov w22, w2 -; NOFP16-NEXT: mov w23, w3 -; NOFP16-NEXT: mov w24, w4 -; NOFP16-NEXT: mov w25, w5 -; NOFP16-NEXT: mov w26, w6 -; NOFP16-NEXT: mov w27, w7 -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w20, w0 -; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w21, w0 -; NOFP16-NEXT: and w0, w22, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w22, w0 -; 
NOFP16-NEXT: and w0, w23, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w23, w0 -; NOFP16-NEXT: and w0, w24, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w24, w0 -; NOFP16-NEXT: and w0, w25, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w25, w0 -; NOFP16-NEXT: and w0, w26, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w26, w0 -; NOFP16-NEXT: and w0, w27, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #14] -; NOFP16-NEXT: mov w0, w26 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #12] -; NOFP16-NEXT: mov w0, w25 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #10] -; NOFP16-NEXT: mov w0, w24 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #8] -; NOFP16-NEXT: mov w0, w23 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #6] -; NOFP16-NEXT: mov w0, w22 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #4] -; NOFP16-NEXT: mov w0, w21 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #2] -; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w5, [x19, #10] +; NOFP16-NEXT: strh w7, [x19, #14] +; NOFP16-NEXT: strh w6, [x19, #12] +; NOFP16-NEXT: strh w4, [x19, #8] +; NOFP16-NEXT: strh w3, [x19, #6] +; NOFP16-NEXT: strh w2, [x19, #4] +; NOFP16-NEXT: strh w1, [x19, #2] ; NOFP16-NEXT: strh w0, [x19] -; NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x30, x27, [sp], #80 // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload ; NOFP16-NEXT: ret %val = call <8 x half> @v8f16_result() store <8 x half> %val, ptr %ptr From 3b7d43301e3662da4197cef7948c18fab850d9c4 Mon Sep 17 
00:00:00 2001 From: Jay Foad Date: Thu, 22 Feb 2024 11:18:18 +0000 Subject: [PATCH 204/351] [AMDGPU] Remove DPP DecoderNamespaces. NFC. (#82491) Now that there is no special checking for valid DPP encodings, these instructions can use the same DecoderNamespace as other 64- or 96-bit instructions. Also clean up setting DecoderNamespace: in most cases it should be set as a pair with AssemblerPredicate. --- .../Disassembler/AMDGPUDisassembler.cpp | 57 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 75 ++- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 36 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 6 +- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 498 ++++++++---------- llvm/lib/Target/AMDGPU/VOPInstructions.td | 16 +- 6 files changed, 288 insertions(+), 400 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index c5d06dea92c30..70e2275c58745 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -462,33 +462,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // encodings if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); - Res = - tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696, - MI, DecW, Address, CS); + Res = tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI, + DecW, Address, CS); if (Res) break; - Res = - tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696, - MI, DecW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696, - MI, DecW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696, - MI, DecW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS); - if (Res) - break; - - Res = 
tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS); + Res = tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI, + DecW, Address, CS); if (Res) break; @@ -508,33 +488,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; } - Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPP8GFX1164, - DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPP8GFX1264, - DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); - if (Res) break; - - Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664, - MI, QW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664, - MI, QW, Address, CS); - if (Res) - break; - if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS); if (Res) @@ -593,7 +546,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; } - // Reinitialize Bytes as DPP64 could have eaten too much + // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); // Try decode 32-bit instruction diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 576ad32a70cf3..f5424cf48d7a5 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -749,7 +749,7 @@ class VOP1_DPP16 op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p class VOP1_DPP16_Gen op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP1_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; } class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -770,7 +770,7 
@@ class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : class VOP1_DPP8_Gen op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP1_DPP8 { let AssemblerPredicate = Gen.AssemblerPredicate; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; } //===----------------------------------------------------------------------===// @@ -816,7 +816,7 @@ multiclass VOP1_Real_dpp_with_name op, string opName, string asmName> { defvar ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16, - DecoderNamespace = "DPP" # Gen.DecoderNamespace # + DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { defm NAME : VOP1_Real_dpp; } @@ -831,7 +831,7 @@ multiclass VOP1_Real_dpp8_with_name op, string opName, string asmName> { defvar ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8, - DecoderNamespace = "DPP8" # Gen.DecoderNamespace # + DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { defm NAME : VOP1_Real_dpp8; } @@ -994,9 +994,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP1_Real_dpp8_gfx10 op> { if !cast(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP1_DPP8(NAME#"_e32")> { - let DecoderNamespace = "DPP8"; - } + def _dpp8_gfx10 : VOP1_DPP8(NAME#"_e32")>; } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" @@ -1192,16 +1190,14 @@ class VOP1_DPPe op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : let Inst{31-25} = 0x3f; //encoding } -multiclass VOP1Only_Real_vi op> { - let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { +let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { + multiclass VOP1Only_Real_vi op> { def _vi : VOP1_Real(NAME), SIEncodingFamily.VI>, VOP1e(NAME).Pfl>; } -} -multiclass VOP1_Real_e32e64_vi op> { - let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { + multiclass VOP1_Real_e32e64_vi op> { 
def _e32_vi : VOP1_Real(NAME#"_e32"), SIEncodingFamily.VI>, VOP1e(NAME#"_e32").Pfl>; @@ -1389,44 +1385,41 @@ def : GCNPat < // GFX9 //===----------------------------------------------------------------------===// -multiclass VOP1_Real_gfx9 op> { - let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { +let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { + multiclass VOP1_Real_gfx9 op> { defm NAME : VOP1_Real_e32e64_vi ; - } - - if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then - def _sdwa_gfx9 : - VOP_SDWA9_Real (NAME#"_sdwa")>, - VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; - - if !cast(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : - VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP1_DPPe(NAME#"_dpp")>; - -} -multiclass VOP1_Real_NoDstSel_SDWA_gfx9 op> { - let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { - defm NAME : VOP1_Real_e32e64_vi ; + if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa")>, + VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; + + if !cast(NAME#"_e32").Pfl.HasExtDPP then + def _dpp_gfx9 : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe(NAME#"_dpp")>; } - if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then - def _sdwa_gfx9 : - VOP_SDWA9_Real (NAME#"_sdwa")>, - VOP1_SDWA9Ae (NAME#"_sdwa").Pfl> { - let Inst{42-40} = 6; - } + multiclass VOP1_Real_NoDstSel_SDWA_gfx9 op> { + defm NAME : VOP1_Real_e32e64_vi ; - if !cast(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : - VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP1_DPPe(NAME#"_dpp")>; + if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa")>, + VOP1_SDWA9Ae (NAME#"_sdwa").Pfl> { + let Inst{42-40} = 6; + } + + if !cast(NAME#"_e32").Pfl.HasExtDPP then + def _dpp_gfx9 : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe(NAME#"_dpp")>; + } } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; -let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in 
+let AssemblerPredicate = isGFX940Plus in defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; let OtherPredicates = [HasFP8ConversionInsts] in { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 9f54e69f6d55e..13fe79b475960 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1273,7 +1273,7 @@ class VOP2_DPP16_Gen op, VOP2_DPP_Pseudo ps, GFXGen Gen, VOP2_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1302,7 +1302,7 @@ class VOP2_DPP8_Gen op, VOP2_Pseudo ps, GFXGen Gen, VOP2_DPP8 { let AssemblerPredicate = Gen.AssemblerPredicate; let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1748,9 +1748,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp8_gfx10 op> { if !cast(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")> { - let DecoderNamespace = "DPP8"; - } + def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")>; } //===------------------------- VOP2 (with name) -------------------------===// @@ -1797,7 +1795,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { def _dpp8_gfx10 : VOP2_DPP8(opName#"_e32")> { VOP2_Pseudo ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"; } } @@ -1876,7 +1873,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { VOP2_DPP8(opName#"_e32")> { string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); - let DecoderNamespace = "DPP8"; } if 
!cast(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp8_w32_gfx10 : @@ -2231,7 +2227,7 @@ multiclass VOP2_SDWA9_Real op> { VOP2_SDWA9Ae (NAME#"_sdwa").Pfl>; } -let AssemblerPredicate = isGFX8Only in { +let AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" in { multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName> { def _e32_vi : @@ -2239,14 +2235,12 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName VOP2e(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX8"; } def _e64_vi : VOP3_Real(OpName#"_e64"), SIEncodingFamily.VI>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX8"; } if !cast(OpName#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : @@ -2263,9 +2257,10 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName let AsmString = AsmName # ps.AsmOperands; } } -} -let AssemblerPredicate = isGFX9Only in { +} // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" + +let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { def _e32_gfx9 : @@ -2273,14 +2268,12 @@ multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { VOP2e(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } def _e64_gfx9 : VOP3_Real(OpName#"_e64"), SIEncodingFamily.GFX9>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } if !cast(OpName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : @@ -2295,21 +2288,16 @@ multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { VOP2_DPPe(OpName#"_dpp")> { VOP2_DPP_Pseudo ps 
= !cast(OpName#"_dpp"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } } multiclass VOP2_Real_e32e64_gfx9 op> { def _e32_gfx9 : VOP2_Real(NAME#"_e32"), SIEncodingFamily.GFX9>, - VOP2e(NAME#"_e32").Pfl>{ - let DecoderNamespace = "GFX9"; - } + VOP2e(NAME#"_e32").Pfl>; def _e64_gfx9 : VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX9>, - VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl> { - let DecoderNamespace = "GFX9"; - } + VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real (NAME#"_sdwa")>, @@ -2318,12 +2306,10 @@ multiclass VOP2_Real_e32e64_gfx9 op> { if !cast(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx9 : VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP2_DPPe(NAME#"_dpp")> { - let DecoderNamespace = "GFX9"; - } + VOP2_DPPe(NAME#"_dpp")>; } -} // AssemblerPredicate = isGFX9Only +} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" multiclass VOP2_Real_e32e64_vi op> : Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index a0090f3e8d1db..cf76de40aef41 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1486,7 +1486,7 @@ multiclass VOP3P_Real_dpp op, string backing_ps_name = NAME, : VOP3P_DPP16(backing_ps_name #"_dpp"), Gen.Subtarget> { let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1496,7 +1496,7 @@ multiclass VOP3P_Real_dpp8 op, string backing_ps_name = NAME defvar ps = !cast(backing_ps_name); def _dpp8#Gen.Suffix : VOP3P_DPP8_Base { let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = 
Gen.AssemblerPredicate; } } @@ -1613,7 +1613,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases op, string Name = !cast(NAME#"_e64").Mnemonic, VOP3_Pseudo PS_ACD = !cast(NAME # "_e64"), VOP3_Pseudo PS_VCD = !cast(NAME # "_vgprcd" # "_e64")> { - let SubtargetPredicate = isGFX940Plus, + let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940", AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { def _gfx940_acd : VOP3P_Real, diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 508f06c4739a5..e5e82447d55fb 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -222,6 +222,8 @@ class VOPCInstAlias { @@ -1331,196 +1333,176 @@ class VOPC64_DPP8_NoDst op, VOP_Pseudo ps, string opName = ps.OpName> //===----------------------------------------------------------------------===// multiclass VOPC_Real_Base op> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast(NAME#"_e32"); defvar ps64 = !cast(NAME#"_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : VOPC_Real, - VOPCe; - def _e64#Gen.Suffix : VOP3_Real, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : VOPC_Real, + VOPCe; + def _e64#Gen.Suffix : VOP3_Real, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast(NAME #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; - def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16 { - let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16 { - let AsmString = psDPP.OpName # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; + def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16 { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16 { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8; - def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8 { - let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8 { - let AsmString = ps32.OpName # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8; + def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8 { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8 { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(NAME #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def 
_e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, - SIMCInstr; - def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { - let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { - let AsmString = psDPP.OpName # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, + SIMCInstr; + def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; - def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { - let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { - let AsmString = ps32.OpName # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; + def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace 
} multiclass VOPC_Real_with_name op, string OpName, string asm_name, string pseudo_mnemonic = ""> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast(OpName#"_e32"); defvar ps64 = !cast(OpName#"_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : - // 32 and 64 bit forms of the instruction have _e32 and _e64 - // respectively appended to their assembly mnemonic. - // _e64 is printed as part of the VOPDstS64orS32 operand, whereas - // the destination-less 32bit forms add it to the asmString here. - VOPC_Real, - VOPCe, - MnemonicAlias, - Requires<[Gen.AssemblerPredicate]>; - def _e64#Gen.Suffix : - VOP3_Real, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, - MnemonicAlias, - Requires<[Gen.AssemblerPredicate]> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : + // 32 and 64 bit forms of the instruction have _e32 and _e64 + // respectively appended to their assembly mnemonic. + // _e64 is printed as part of the VOPDstS64orS32 operand, whereas + // the destination-less 32bit forms add it to the asmString here. + VOPC_Real, + VOPCe, + MnemonicAlias, + Requires<[Gen.AssemblerPredicate]>; + def _e64#Gen.Suffix : + VOP3_Real, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, + MnemonicAlias, + Requires<[Gen.AssemblerPredicate]> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast(OpName #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; - def _e32_dpp_w32#Gen.Suffix - : VOPC_DPP16 { - let AsmString = asm_name # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp_w64#Gen.Suffix - : VOPC_DPP16 { - let AsmString = asm_name # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; + def _e32_dpp_w32#Gen.Suffix + : VOPC_DPP16 { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64#Gen.Suffix + : VOPC_DPP16 { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8; - def _e32_dpp8_w32#Gen.Suffix - : VOPC_DPP8 { - let AsmString = asm_name # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp8_w64#Gen.Suffix - : VOPC_DPP8 { - let AsmString = asm_name # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8; + def _e32_dpp8_w32#Gen.Suffix + : VOPC_DPP8 { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64#Gen.Suffix + : VOPC_DPP8 { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(OpName #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def 
_e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, - SIMCInstr; - def _e64_dpp_w32#Gen.Suffix - : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp_w64#Gen.Suffix - : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, + SIMCInstr; + def _e64_dpp_w32#Gen.Suffix + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64#Gen.Suffix + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; - def _e64_dpp8_w32#Gen.Suffix - : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp8_w64#Gen.Suffix - : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; + def _e64_dpp8_w32#Gen.Suffix + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64#Gen.Suffix + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } - } // AssemblerPredicate = 
Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPC_Real_t16 op, string asm_name, @@ -1528,123 +1510,103 @@ multiclass VOPC_Real_t16 op, string asm_name, VOPC_Real_with_name; multiclass VOPCX_Real op> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast(NAME#"_nosdst_e32"); defvar ps64 = !cast(NAME#"_nosdst_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : - VOPC_Real, - VOPCe { - let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) - # " " # ps32.AsmOperands; - } - def _e64#Gen.Suffix : - VOP3_Real, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ?; // sdst - let AsmString = !subst("_nosdst", "", ps64.Mnemonic) - # "{_e64} " # ps64.AsmOperands; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : + VOPC_Real, + VOPCe { + let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) + # " " # ps32.AsmOperands; + } + def _e64#Gen.Suffix : + VOP3_Real, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", ps64.Mnemonic) + # "{_e64} " # ps64.AsmOperands; + } defm : VOPCXInstAliases; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast(NAME #"_nosdst_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix - : VOPC_DPP16_SIMC { - let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; - } + def _e32_dpp#Gen.Suffix + : VOPC_DPP16_SIMC { + let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8 { - let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8 { + let 
AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(NAME #"_nosdst_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP>, - SIMCInstr { - let AsmString = !subst("_nosdst", "", psDPP.OpName) - # "{_e64_dpp} " # AsmDPP; - } + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP>, + SIMCInstr { + let AsmString = !subst("_nosdst", "", psDPP.OpName) + # "{_e64_dpp} " # AsmDPP; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { - let AsmString = !subst("_nosdst", "", ps64.OpName) - # "{_e64_dpp} " # AsmDPP8; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { + let AsmString = !subst("_nosdst", "", ps64.OpName) + # "{_e64_dpp} " # AsmDPP8; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPCX_Real_with_name op, string OpName, string asm_name, string pseudo_mnemonic = ""> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast(OpName#"_nosdst_e32"); defvar ps64 = !cast(OpName#"_nosdst_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix - : VOPC_Real, - MnemonicAlias, - Requires<[Gen.AssemblerPredicate]>, - VOPCe { - let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; - } - def _e64#Gen.Suffix - : VOP3_Real, - MnemonicAlias, - Requires<[Gen.AssemblerPredicate]>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ? 
; // sdst - let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix + : VOPC_Real, + MnemonicAlias, + Requires<[Gen.AssemblerPredicate]>, + VOPCe { + let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; + } + def _e64#Gen.Suffix + : VOP3_Real, + MnemonicAlias, + Requires<[Gen.AssemblerPredicate]>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ? ; // sdst + let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + } defm : VOPCXInstAliases; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast(OpName#"_nosdst_e32"#"_dpp"); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; - } - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; + def _e32_dpp8#Gen.Suffix : VOPC_DPP8; } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(OpName#"_nosdst_e64"#"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, - SIMCInstr { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; - } + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, + SIMCInstr { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPCX_Real_t16 op, string asm_name, @@ -1873,21 +1835,19 @@ defm V_CMPX_CLASS_F64 : 
VOPCX_Real_gfx11_gfx12<0x0ff>; // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Only in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass VOPC_Real_gfx10 op> { - let DecoderNamespace = "GFX10" in { - def _e32_gfx10 : - VOPC_Real(NAME#"_e32"), SIEncodingFamily.GFX10>, - VOPCe; - def _e64_gfx10 : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, - VOP3a_gfx10<{0, op}, !cast(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = "GFX10" + def _e32_gfx10 : + VOPC_Real(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOPCe; + def _e64_gfx10 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : @@ -1898,22 +1858,20 @@ let AssemblerPredicate = isGFX10Only in { } multiclass VOPCX_Real_gfx10 op> { - let DecoderNamespace = "GFX10" in { - def _e32_gfx10 : - VOPC_Real(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, - VOPCe { - let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e32").PseudoInstr) - # " " # !cast(NAME#"_nosdst_e32").AsmOperands; - } - - def _e64_gfx10 : - VOP3_Real(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, - VOP3a_gfx10<{0, op}, !cast(NAME#"_nosdst_e64").Pfl> { - let Inst{7-0} = ?; // sdst - let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e64").Mnemonic) - # "{_e64} " # !cast(NAME#"_nosdst_e64").AsmOperands; - } - } // End DecoderNamespace = "GFX10" + def _e32_gfx10 : + VOPC_Real(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, + VOPCe { + let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e32").PseudoInstr) + # " " # !cast(NAME#"_nosdst_e32").AsmOperands; + } + + def _e64_gfx10 : + VOP3_Real(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast(NAME#"_nosdst_e64").Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e64").Mnemonic) + # "{_e64} " # !cast(NAME#"_nosdst_e64").AsmOperands; + } if !cast(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : @@ -1925,7 +1883,7 @@ let AssemblerPredicate = isGFX10Only in { defm : VOPCXInstAliases; } -} // End AssemblerPredicate = isGFX10Only +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>; defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>; @@ -1990,25 +1948,23 @@ defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>; // GFX6, GFX7, GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX6GFX7 in { +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { multiclass VOPC_Real_gfx6_gfx7 op> { - let DecoderNamespace = "GFX6GFX7" in { - def _e32_gfx6_gfx7 : - VOPC_Real(NAME#"_e32"), SIEncodingFamily.SI>, - VOPCe; - def _e64_gfx6_gfx7 : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3a_gfx6_gfx7(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = "GFX6GFX7" + def _e32_gfx6_gfx7 : + VOPC_Real(NAME#"_e32"), SIEncodingFamily.SI>, + VOPCe; + def _e64_gfx6_gfx7 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3a_gfx6_gfx7(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases; } -} // End AssemblerPredicate = isGFX6GFX7 +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" multiclass VOPC_Real_gfx6_gfx7_gfx10 op> : VOPC_Real_gfx6_gfx7, VOPC_Real_gfx10; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 801afabbdb140..2989d05e968ef 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -835,7 +835,7 @@ class VOP_DPP_Pseudo pattern=[], AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); - let DecoderNamespace = "DPP"; + let DecoderNamespace = "GFX8"; VOPProfile Pfl = P; } @@ -906,7 +906,7 @@ class VOP_DPP_Base op, VOP_DPP_Pseudo ps, GFXGen Gen, VOP3_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1463,7 +1463,7 @@ multiclass VOP3_Real_dpp_with_name op, string opName, multiclass VOP3_Real_dpp8_Base op, string opName = NAME> { defvar ps = !cast(opName#"_e64"); def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8 { - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1473,7 +1473,7 @@ multiclass VOP3Dot_Real_dpp8_Base op, string opName = NAME> def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8 { let Inst{11} = ?; let Inst{12} = ?; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1482,7 +1482,7 @@ multiclass VOP3_Real_dpp8_with_name op, string opName, string asmName> { defvar ps = !cast(opName#"_e64"); let AsmString = asmName # 
ps.Pfl.AsmVOP3DPP8, - DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"), True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate) in { @@ -1505,7 +1505,7 @@ multiclass VOP3be_Real_dpp op, string opName, defvar dpp_ps = !cast(opName #"_e64" #"_dpp"); def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16, SIMCInstr { - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1514,7 +1514,7 @@ multiclass VOP3be_Real_dpp8 op, string opName, string asmName> { defvar ps = !cast(opName #"_e64"); def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base { - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } From f17e4151423a798c18533080fe7f8a3e922d7312 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Thu, 22 Feb 2024 11:36:18 +0000 Subject: [PATCH 205/351] [AArch64] Mangle names of all ARM64EC functions with entry thunks (#80996) This better matches MSVC output in cases where static functions have their addresses taken. --- llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp | 2 +- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 3 ++- .../CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index c62582ac01a4c..a99856dcc9439 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -712,7 +712,7 @@ bool AArch64Arm64ECCallLowering::processFunction( // name (emitting the definition) can grab it from the metadata. // // FIXME: Handle functions with weak linkage? 
- if (F.hasExternalLinkage() || F.hasWeakLinkage() || F.hasLinkOnceLinkage()) { + if (!F.hasLocalLinkage() || F.hasAddressTaken()) { if (std::optional MangledName = getArm64ECMangledFunctionName(F.getName().str())) { F.setMetadata("arm64ec_unmangled_name", diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 5b5ffd7b2feb0..4fa719ad67cf3 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1121,7 +1121,8 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { TS->emitDirectiveVariantPCS(CurrentFnSym); } - if (TM.getTargetTriple().isWindowsArm64EC()) { + if (TM.getTargetTriple().isWindowsArm64EC() && + !MF->getFunction().hasLocalLinkage()) { // For ARM64EC targets, a function definition's name is mangled differently // from the normal symbol. We emit the alias from the unmangled symbol to // mangled symbol name here. diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll index 00ae34bf4b00f..217f08be05218 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll @@ -2,7 +2,8 @@ ; Validates when local linkage functions get a thunk generated. -; Being called does not cause a thunk to be generated. +; Being called does not cause a thunk to be generated or the symbol name to be mangled. +; CHECK-NOT: "#does_not_have_addr_taken": ; CHECK-NOT: $ientry_thunk$cdecl$v$f; define internal void @does_not_have_addr_taken(float) nounwind { ret void @@ -12,7 +13,8 @@ define void @calls_does_not_have_addr_taken() nounwind { ret void } -; Having an address taken does cause a thunk to be generated. +; Having an address taken does cause a thunk to be generated and the symbol name to be mangled. 
+; CHECK: "#has_addr_taken": ; CHECK: $ientry_thunk$cdecl$v$i8; define internal void @has_addr_taken(i64) nounwind { ret void From 1f99a450127c2404d4f9b8ac24acdb17823c988b Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 20 Feb 2024 15:08:06 +0000 Subject: [PATCH 206/351] [AArch64] Remove unused ReverseCSRRestoreSeq option. (#82326) This patch removes the `-reverse-csr-restore-seq` option from AArch64FrameLowering, since this is no longer used. This patch was reverted because of a crash in PR#79623. Merging it back as it was fixed in PR#82492. --- .../Target/AArch64/AArch64FrameLowering.cpp | 66 ++++-------- .../AArch64/reverse-csr-restore-seq.mir | 101 ------------------ 2 files changed, 21 insertions(+), 146 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 503b1c199650f..5cc612e89162a 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -239,11 +239,6 @@ static cl::opt EnableRedZone("aarch64-redzone", cl::desc("enable use of redzone on AArch64"), cl::init(false), cl::Hidden); -static cl::opt - ReverseCSRRestoreSeq("reverse-csr-restore-seq", - cl::desc("reverse the CSR restore sequence"), - cl::init(false), cl::Hidden); - static cl::opt StackTaggingMergeSetTag( "stack-tagging-merge-settag", cl::desc("merge settag instruction in function epilog"), cl::init(true), @@ -307,8 +302,6 @@ bool AArch64FrameLowering::homogeneousPrologEpilog( return false; if (!EnableHomogeneousPrologEpilog) return false; - if (ReverseCSRRestoreSeq) - return false; if (EnableRedZone) return false; @@ -3117,7 +3110,27 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); - auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator { + if (homogeneousPrologEpilog(MF, &MBB)) { + 
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) + .setMIFlag(MachineInstr::FrameDestroy); + for (auto &RPI : RegPairs) { + MIB.addReg(RPI.Reg1, RegState::Define); + MIB.addReg(RPI.Reg2, RegState::Define); + } + return true; + } + + // For performance reasons restore SVE register in increasing order + auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; + auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); + auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); + std::reverse(PPRBegin, PPREnd); + auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; + auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); + auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); + std::reverse(ZPRBegin, ZPREnd); + + for (const RegPairInfo &RPI : RegPairs) { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; @@ -3191,43 +3204,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); - - return MIB->getIterator(); - }; - - if (homogeneousPrologEpilog(MF, &MBB)) { - auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) - .setMIFlag(MachineInstr::FrameDestroy); - for (auto &RPI : RegPairs) { - MIB.addReg(RPI.Reg1, RegState::Define); - MIB.addReg(RPI.Reg2, RegState::Define); - } - return true; - } - - // For performance reasons restore SVE register in increasing order - auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; - auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); - auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); - std::reverse(PPRBegin, PPREnd); - auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; - auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); - auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); - 
std::reverse(ZPRBegin, ZPREnd); - - if (ReverseCSRRestoreSeq) { - MachineBasicBlock::iterator First = MBB.end(); - for (const RegPairInfo &RPI : reverse(RegPairs)) { - MachineBasicBlock::iterator It = EmitMI(RPI); - if (First == MBB.end()) - First = It; - } - if (First != MBB.end()) - MBB.splice(MBBI, &MBB, First); - } else { - for (const RegPairInfo &RPI : RegPairs) { - (void)EmitMI(RPI); - } } return true; diff --git a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir deleted file mode 100644 index de4baec50e0c6..0000000000000 --- a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir +++ /dev/null @@ -1,101 +0,0 @@ -# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK -# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK -# ---- | - - define void @foo() nounwind { entry: unreachable } - - define void @bar() nounwind { entry: unreachable } - - define void @baz() nounwind { entry: unreachable } - -... ---- -name: foo -# CHECK-LABEL: name: foo -tracksRegLiveness: true -body: | - bb.0: - $x19 = IMPLICIT_DEF - $x20 = IMPLICIT_DEF - $x21 = IMPLICIT_DEF - $x22 = IMPLICIT_DEF - $x23 = IMPLICIT_DEF - $x24 = IMPLICIT_DEF - $x25 = IMPLICIT_DEF - $x26 = IMPLICIT_DEF - - ; The local stack size is 0, so the last ldp in the sequence will also - ; restore the stack. - ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 2 - ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4 - ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6 - - ; The ldp and the stack increment get merged even before - ; the load-store optimizer. - ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8 - - RET_ReallyLR -... 
---- -name: bar -# CHECK-LABEL: name: bar -tracksRegLiveness: true -stack: - - { id : 0, size: 8, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - local-offset: -4, debug-info-variable: '', debug-info-expression: '', - debug-info-location: '' } - -body: | - bb.0: - $x19 = IMPLICIT_DEF - $x20 = IMPLICIT_DEF - $x21 = IMPLICIT_DEF - $x22 = IMPLICIT_DEF - $x23 = IMPLICIT_DEF - $x24 = IMPLICIT_DEF - $x25 = IMPLICIT_DEF - $x26 = IMPLICIT_DEF - - ; The local stack size is not 0, and we can combine the CSR stack size with - ; the local stack size. This results in rewriting the offsets for all the - ; save/restores and forbids us to merge the stack adjustment and the last pop. - ; In this case, there is no point of moving the first CSR pair at the end. - ; We do it anyway, as it's a small price to pay for the resulting - ; simplification in the epilogue emission code. - ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 4 - ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6 - ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8 - ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0 - RET_ReallyLR -... ---- -# Check that the load from the offset 0 is moved at the end even when hasFP is -# false. -name: baz -# CHECK-LABEL: name: baz -alignment: 4 -tracksRegLiveness: true -frameInfo: - adjustsStack: true - hasCalls: true -body: | - bb.0: - successors: %bb.1 - - $x0 = IMPLICIT_DEF - $x20 = IMPLICIT_DEF - $x21 = IMPLICIT_DEF - - ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp - BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0 - ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp - B %bb.1 - - bb.1: - ; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2 - ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32 - RET_ReallyLR -... 
From 4235e44d4c37ca738c74def05da8caf124d2464e Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 22 Feb 2024 13:15:26 +0100 Subject: [PATCH 207/351] [GlobalISel] Constant-fold G_PTR_ADD with different type sizes (#81473) All other opcodes in the list are constrained to have the same type on both operands, but not G_PTR_ADD. Fixes #81464 --- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 5 ++- .../combine-extract-vector-load.mir | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 26fd12f9e51c4..23ad68b331c97 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -660,8 +660,11 @@ std::optional llvm::ConstantFoldBinOp(unsigned Opcode, default: break; case TargetOpcode::G_ADD: - case TargetOpcode::G_PTR_ADD: return C1 + C2; + case TargetOpcode::G_PTR_ADD: + // Types can be of different width here. + // Result needs to be the same width as C1, so trunc or sext C2. + return C1 + C2.sextOrTrunc(C1.getBitWidth()); case TargetOpcode::G_AND: return C1 & C2; case TargetOpcode::G_ASHR: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir new file mode 100644 index 0000000000000..aa72a9ec06ede --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir @@ -0,0 +1,40 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +# Tries to emit a foldable G_PTR_ADD with (p1, s32) operands. 
+--- +name: test_ptradd_crash__offset_smaller +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: test_ptradd_crash__offset_smaller + ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 12 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: $sgpr0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + %1:_(p1) = G_CONSTANT i64 0 + %3:_(s32) = G_CONSTANT i32 3 + %0:_(<4 x s32>) = G_LOAD %1 :: (load (<4 x s32>) from `ptr addrspace(1) null`, addrspace 1) + %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %3 + $sgpr0 = COPY %2 + SI_RETURN_TO_EPILOG implicit $sgpr0 +... + +# Tries to emit a foldable G_PTR_ADD with (p1, s128) operands. +--- +name: test_ptradd_crash__offset_wider +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: test_ptradd_crash__offset_wider + ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 12 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: $sgpr0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + %1:_(p1) = G_CONSTANT i64 0 + %3:_(s128) = G_CONSTANT i128 3 + %0:_(<4 x s32>) = G_LOAD %1 :: (load (<4 x s32>) from `ptr addrspace(1) null`, addrspace 1) + %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %3 + $sgpr0 = COPY %2 + SI_RETURN_TO_EPILOG implicit $sgpr0 +... From 3ef63a71adb7fd1c792fd61d00c74159fcef9a2f Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 22 Feb 2024 20:57:34 +0800 Subject: [PATCH 208/351] [CVP] Refactor `processMinMaxIntrinsic` to check non-strict predicate in both directions (#82596) This patch uses `getConstantRangeAtUse` in `processMinMaxIntrinsic` to address the comment https://github.com/llvm/llvm-project/pull/82478#discussion_r1497300920. After this patch we can reuse the range result in https://github.com/llvm/llvm-project/pull/82478. 
--- .../Scalar/CorrelatedValuePropagation.cpp | 26 +++++--- .../CorrelatedValuePropagation/min-max.ll | 63 +++++++++++++++++-- 2 files changed, 76 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 9235850de92f3..c71870bc1b656 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -530,15 +530,23 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) { // See if this min/max intrinsic always picks it's one specific operand. static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) { CmpInst::Predicate Pred = CmpInst::getNonStrictPredicate(MM->getPredicate()); - LazyValueInfo::Tristate Result = LVI->getPredicateAt( - Pred, MM->getLHS(), MM->getRHS(), MM, /*UseBlockValue=*/true); - if (Result == LazyValueInfo::Unknown) - return false; - - ++NumMinMax; - MM->replaceAllUsesWith(MM->getOperand(!Result)); - MM->eraseFromParent(); - return true; + ConstantRange LHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(0), + /*UndefAllowed*/ false); + ConstantRange RHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(1), + /*UndefAllowed*/ false); + if (LHS_CR.icmp(Pred, RHS_CR)) { + ++NumMinMax; + MM->replaceAllUsesWith(MM->getLHS()); + MM->eraseFromParent(); + return true; + } + if (RHS_CR.icmp(Pred, LHS_CR)) { + ++NumMinMax; + MM->replaceAllUsesWith(MM->getRHS()); + MM->eraseFromParent(); + return true; + } + return false; } // Rewrite this with.overflow intrinsic as non-overflowing. 
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll index 705b6e96fe9e3..d21b8f2418c2e 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll @@ -71,7 +71,6 @@ define i8 @test6(i8 %x) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[LIM:%.*]] = icmp uge i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umin.i8(i8 [[X]], i8 42) ; CHECK-NEXT: ret i8 42 ; %lim = icmp uge i8 %x, 42 @@ -119,7 +118,6 @@ define i8 @test10(i8 %x) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[LIM:%.*]] = icmp ule i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 42) ; CHECK-NEXT: ret i8 42 ; %lim = icmp ule i8 %x, 42 @@ -167,7 +165,6 @@ define i8 @test14(i8 %x) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 42) ; CHECK-NEXT: ret i8 42 ; %lim = icmp sge i8 %x, 42 @@ -215,7 +212,6 @@ define i8 @test18(i8 %x) { ; CHECK-LABEL: @test18( ; CHECK-NEXT: [[LIM:%.*]] = icmp sle i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 42) ; CHECK-NEXT: ret i8 42 ; %lim = icmp sle i8 %x, 42 @@ -235,3 +231,62 @@ define i8 @test19(i8 %x) { %r = call i8 @llvm.smax(i8 %x, i8 42) ret i8 %r } + +declare void @body(i32) + +define void @test_bidirectional() { +; CHECK-LABEL: @test_bidirectional( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: call void @body(i32 65535) +; CHECK-NEXT: [[INC]] = add nsw i32 [[INDVAR]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVAR]], 65535 
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvar = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %smax = call i32 @llvm.smax.i32(i32 %indvar, i32 65535) + call void @body(i32 %smax) + %inc = add nsw i32 %indvar, 1 + %cmp = icmp slt i32 %indvar, 65535 + br i1 %cmp, label %for.body, label %exit + +exit: + ret void +} + +define i64 @test_at_use(i1 %cond, i64 %x) { +; CHECK-LABEL: @test_at_use( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[IF_END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[X:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK: if.then: +; CHECK-NEXT: ret i64 0 +; CHECK: if.end: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[X]], [[BB1]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i64 [[PHI]] +; +entry: + br i1 %cond, label %bb1, label %if.end + +bb1: + %val = call i64 @llvm.smax.i64(i64 %x, i64 -1) + %cmp = icmp slt i64 %x, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + ret i64 0 + +if.end: + %phi = phi i64 [%val, %bb1], [0, %entry] + ret i64 %phi +} From c831d83bb17caa3a8f137052559cb6c54b21b7c1 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 22 Feb 2024 13:59:04 +0100 Subject: [PATCH 209/351] [InferAddrSpaces] Correctly replace identical operands of insts (#82610) It's important for PHI nodes because if a PHI node has multiple edges coming from the same block, we can have the same incoming value multiple times in the list of incoming values. All of those need to be consistent (exact same Value*) otherwise verifier complains. 
Fixes SWDEV-445797 --- .../Transforms/Scalar/InferAddressSpaces.cpp | 13 ++-- .../AMDGPU/multiple-uses-of-val.ll | 69 +++++++++++++++++++ 2 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 1bf50d79e5331..851eab04c8dbb 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -1221,6 +1221,7 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( Value::use_iterator I, E, Next; for (I = V->use_begin(), E = V->use_end(); I != E;) { Use &U = *I; + User *CurUser = U.getUser(); // Some users may see the same pointer operand in multiple operands. Skip // to the next instruction. @@ -1231,11 +1232,10 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( // If V is used as the pointer operand of a compatible memory operation, // sets the pointer operand to NewV. This replacement does not change // the element type, so the resultant load/store is still valid. - U.set(NewV); + CurUser->replaceUsesOfWith(V, NewV); continue; } - User *CurUser = U.getUser(); // Skip if the current user is the new value itself. if (CurUser == NewV) continue; @@ -1311,10 +1311,13 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( while (isa(InsertPos)) ++InsertPos; - U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); + // This instruction may contain multiple uses of V, update them all. 
+ CurUser->replaceUsesOfWith( + V, new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); } else { - U.set(ConstantExpr::getAddrSpaceCast(cast(NewV), - V->getType())); + CurUser->replaceUsesOfWith( + V, ConstantExpr::getAddrSpaceCast(cast(NewV), + V->getType())); } } } diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll new file mode 100644 index 0000000000000..717bd09897732 --- /dev/null +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -S -passes=infer-address-spaces --verify-each %s | FileCheck %s + +; Inst can use a value multiple time. When we're inserting an addrspacecast to flat, +; it's important all the identical uses use an indentical replacement, especially +; for PHIs. + +define amdgpu_kernel void @test_phi() { +; CHECK-LABEL: @test_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOADED_PTR:%.*]] = load ptr, ptr addrspace(4) null, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[LOADED_PTR]] to ptr addrspace(1) +; CHECK-NEXT: br label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr addrspace(1) [[TMP0]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[GEP]] to ptr +; CHECK-NEXT: switch i32 0, label [[END:%.*]] [ +; CHECK-NEXT: i32 1, label [[END]] +; CHECK-NEXT: i32 4, label [[END]] +; CHECK-NEXT: i32 5, label [[BB1:%.*]] +; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr addrspace(1) [[GEP]], align 16 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RETVAL_SROA_0_0_I569_PH:%.*]] = phi ptr [ null, [[BB1]] ], [ [[TMP1]], [[BB0]] ], [ [[TMP1]], [[BB0]] ], [ [[TMP1]], [[BB0]] ] +; CHECK-NEXT: ret void +; +entry: + %loaded.ptr = load ptr, ptr addrspace(4) null, align 8 + br label 
%bb0 + +bb0: + %gep = getelementptr i64, ptr %loaded.ptr, i64 3 + switch i32 0, label %end [ + i32 1, label %end + i32 4, label %end + i32 5, label %bb1 + ] + +bb1: + %0 = load double, ptr %gep, align 16 + br label %end + +end: + %retval.sroa.0.0.i569.ph = phi ptr [ null, %bb1 ], [ %gep, %bb0 ], [ %gep, %bb0 ], [ %gep, %bb0 ] + ret void +} + +declare void @uses_ptrs(ptr, ptr, ptr) + +; We shouldn't treat PHIs differently, even other users should have the same treatment. +; All occurences of %gep are replaced with an identical value. +define amdgpu_kernel void @test_other() { +; CHECK-LABEL: @test_other( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOADED_PTR:%.*]] = load ptr, ptr addrspace(4) null, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[LOADED_PTR]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[TMP1]], i64 3 +; CHECK-NEXT: call void @uses_ptrs(ptr [[GEP]], ptr [[GEP]], ptr [[GEP]]) +; CHECK-NEXT: ret void +; +entry: + %loaded.ptr = load ptr, ptr addrspace(4) null, align 8 + %gep = getelementptr i64, ptr %loaded.ptr, i64 3 + call void @uses_ptrs(ptr %gep, ptr %gep, ptr %gep) + ret void +} From 73c646a3b27293f8cb4ba120de7bc01c223b4b5f Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 22 Feb 2024 12:58:10 +0000 Subject: [PATCH 210/351] [flang] Fix warning when with clang-cl/msvc \llvm\flang\lib\Evaluate\fold-integer.cpp(705,35): warning: lambda capture 'FromInt64' is not used [-Wunused-lambda-capture] It is intentionally unused. --- flang/lib/Evaluate/fold-integer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp index 0e8706e0f2740..09b2f91debda2 100644 --- a/flang/lib/Evaluate/fold-integer.cpp +++ b/flang/lib/Evaluate/fold-integer.cpp @@ -719,6 +719,7 @@ Expr> FoldIntrinsicFunction( // CharacterUtils<2>::ICHAR(). 
Can't find a work-around, // so remove the FromInt64 error checking lambda that // seems to have caused the proble. + (void)FromInt64; [](const Scalar &c) { return CharacterUtils::ICHAR( CharacterUtils::Resize(c, 1)); From 18f116651af0e328e6f9f6b0619171bd8a2c4817 Mon Sep 17 00:00:00 2001 From: pwprzybyla <121295298+pwprzybyla@users.noreply.github.com> Date: Thu, 22 Feb 2024 14:04:21 +0100 Subject: [PATCH 211/351] Multilib support for libraries with exceptions (#75031) For better multilib matching explicitly match -fno-rtti and -fno-exceptions --- clang/include/clang/Driver/ToolChain.h | 10 ++++++++++ clang/lib/Driver/ToolChain.cpp | 23 ++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index 2d0c1f826c172..fbe2e8fe8e88d 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -120,6 +120,11 @@ class ToolChain { RM_Disabled, }; + enum ExceptionsMode { + EM_Enabled, + EM_Disabled, + }; + struct BitCodeLibraryInfo { std::string Path; bool ShouldInternalize; @@ -141,6 +146,8 @@ class ToolChain { const RTTIMode CachedRTTIMode; + const ExceptionsMode CachedExceptionsMode; + /// The list of toolchain specific path prefixes to search for libraries. path_list LibraryPaths; @@ -318,6 +325,9 @@ class ToolChain { // Returns the RTTIMode for the toolchain with the current arguments. RTTIMode getRTTIMode() const { return CachedRTTIMode; } + // Returns the ExceptionsMode for the toolchain with the current arguments. + ExceptionsMode getExceptionsMode() const { return CachedExceptionsMode; } + /// Return any implicit target and/or mode flag for an invocation of /// the compiler driver as `ProgName`. 
/// diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 388030592b483..f8c13c86daf9b 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -77,10 +77,19 @@ static ToolChain::RTTIMode CalculateRTTIMode(const ArgList &Args, return NoRTTI ? ToolChain::RM_Disabled : ToolChain::RM_Enabled; } +static ToolChain::ExceptionsMode CalculateExceptionsMode(const ArgList &Args) { + if (Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions, + true)) { + return ToolChain::EM_Enabled; + } + return ToolChain::EM_Disabled; +} + ToolChain::ToolChain(const Driver &D, const llvm::Triple &T, const ArgList &Args) : D(D), Triple(T), Args(Args), CachedRTTIArg(GetRTTIArgument(Args)), - CachedRTTIMode(CalculateRTTIMode(Args, Triple, CachedRTTIArg)) { + CachedRTTIMode(CalculateRTTIMode(Args, Triple, CachedRTTIArg)), + CachedExceptionsMode(CalculateExceptionsMode(Args)) { auto addIfExists = [this](path_list &List, const std::string &Path) { if (getVFS().exists(Path)) List.push_back(Path); @@ -264,6 +273,18 @@ ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { break; } + // Include fno-exceptions and fno-rtti + // to improve multilib selection + if (getRTTIMode() == ToolChain::RTTIMode::RM_Disabled) + Result.push_back("-fno-rtti"); + else + Result.push_back("-frtti"); + + if (getExceptionsMode() == ToolChain::ExceptionsMode::EM_Disabled) + Result.push_back("-fno-exceptions"); + else + Result.push_back("-fexceptions"); + // Sort and remove duplicates. 
std::sort(Result.begin(), Result.end()); Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); From b47f63d3c8fedf7c98b7f58e892e784fddee4601 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 22 Feb 2024 13:07:31 +0000 Subject: [PATCH 212/351] [Clang][SME] Detect always_inline used with mismatched streaming attributes (#77936) This patch adds an error that is emitted when a streaming function is marked as always_inline and is called from a non-streaming function. --- .../clang/Basic/DiagnosticFrontendKinds.td | 4 ++ clang/lib/CodeGen/Targets/AArch64.cpp | 43 +++++++++++++++++ .../aarch64-sme-inline-streaming-attrs.c | 47 +++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index b1a282f5164a2..dcd2c19fb7ee3 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -279,6 +279,10 @@ def err_builtin_needs_feature : Error<"%0 needs target feature %1">; def err_function_needs_feature : Error< "always_inline function %1 requires target feature '%2', but would " "be inlined into function %0 that is compiled without support for '%2'">; +def err_function_always_inline_attribute_mismatch : Error< + "always_inline function %1 and its caller %0 have mismatching %2 attributes">; +def err_function_always_inline_new_za : Error< + "always_inline function %0 has new za state">; def warn_avx_calling_convention : Warning<"AVX vector %select{return|argument}0 of type %1 without '%2' " diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index ee7f95084d2e0..94f8e7be2ee6e 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -8,6 +8,7 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" +#include 
"clang/Basic/DiagnosticFrontend.h" using namespace clang; using namespace clang::CodeGen; @@ -155,6 +156,11 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo { } return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty); } + + void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc, + const FunctionDecl *Caller, + const FunctionDecl *Callee, + const CallArgList &Args) const override; }; class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo { @@ -814,6 +820,43 @@ Address AArch64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr, /*allowHigherAlign*/ false); } +static bool isStreaming(const FunctionDecl *F) { + if (F->hasAttr()) + return true; + if (const auto *T = F->getType()->getAs()) + return T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask; + return false; +} + +static bool isStreamingCompatible(const FunctionDecl *F) { + if (const auto *T = F->getType()->getAs()) + return T->getAArch64SMEAttributes() & + FunctionType::SME_PStateSMCompatibleMask; + return false; +} + +void AArch64TargetCodeGenInfo::checkFunctionCallABI( + CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller, + const FunctionDecl *Callee, const CallArgList &Args) const { + if (!Caller || !Callee || !Callee->hasAttr()) + return; + + bool CallerIsStreaming = isStreaming(Caller); + bool CalleeIsStreaming = isStreaming(Callee); + bool CallerIsStreamingCompatible = isStreamingCompatible(Caller); + bool CalleeIsStreamingCompatible = isStreamingCompatible(Callee); + + if (!CalleeIsStreamingCompatible && + (CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible)) + CGM.getDiags().Report(CallLoc, + diag::err_function_always_inline_attribute_mismatch) + << Caller->getDeclName() << Callee->getDeclName() << "streaming"; + if (auto *NewAttr = Callee->getAttr()) + if (NewAttr->isNewZA()) + CGM.getDiags().Report(CallLoc, diag::err_function_always_inline_new_za) + << Callee->getDeclName(); +} + 
std::unique_ptr CodeGen::createAArch64TargetCodeGenInfo(CodeGenModule &CGM, AArch64ABIKind Kind) { diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c new file mode 100644 index 0000000000000..7eb74f28a1c85 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_NONE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_COMPATIBLE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_STREAMING %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_LOCALLY %s + +#define __ai __attribute__((always_inline)) +__ai void inlined_fn(void) {} +__ai void inlined_fn_streaming_compatible(void) __arm_streaming_compatible {} +__ai void inlined_fn_streaming(void) __arm_streaming {} +__ai __arm_locally_streaming void inlined_fn_local(void) {} + +#ifdef TEST_NONE +void caller(void) { + inlined_fn(); + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller' have mismatching streaming attributes}} + inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller' have mismatching streaming attributes}} +} +#endif + +#ifdef TEST_COMPATIBLE +void caller_compatible(void) __arm_streaming_compatible { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller_compatible' have mismatching streaming attributes}} + inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 
'caller_compatible' have mismatching streaming attributes}} +} +#endif + +#ifdef TEST_STREAMING +void caller_streaming(void) __arm_streaming { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); + inlined_fn_local(); +} +#endif + +#ifdef TEST_LOCALLY +__arm_locally_streaming +void caller_local(void) { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); + inlined_fn_local(); +} +#endif From fa8a21144ec9a6836e9bf1e3bf5cd0b2f058209e Mon Sep 17 00:00:00 2001 From: NagyDonat Date: Thu, 22 Feb 2024 14:19:20 +0100 Subject: [PATCH 213/351] [analyzer] Improve handling of unsigned values in ArrayBoundCheckerV2 (#81034) A memory access is an out of bounds error if the offset is < the extent of the memory region. Notice that here "<" is a _mathematical_ comparison between two numbers and NOT a C/C++ operator that compares two typed C++ values: for example -1 < 1000 is true in mathematics, but if the `-1` is an `int` and the `1000` is a `size_t` value, then evaluating the C/C++ operator `<` will return false because the `-1` will be converted to `SIZE_MAX` by the automatic type conversions. This means that it's incorrect to perform a bounds check with `evalBinOpNN(State, BO_LT, ...)` which performs automatic conversions and can produce wildly incorrect results. ArrayBoundsCheckerV2 already had a special case where it avoided calling `evalBinOpNN` in a situation where it would have performed an automatic conversion; this commit replaces that code with a more general one that covers more situations. (It's still not perfect, but it's better than the previous version and I think it will cover practically all real-world code.) 
Note that this is not a limitation/bug of the simplification algorithm defined in `getSimplifedOffsets()`: the simplification is not applied in the test case `test_comparison_with_extent_symbol` (because the `Extent` is not a concrete int), but without the new code it would still run into a `-1 < UNSIGNED` comparison that evaluates to false because `evalBinOpNN` performs an automatic type conversion. --- .../Core/PathSensitive/SValBuilder.h | 12 ++++-- .../Checkers/ArrayBoundCheckerV2.cpp | 42 +++++++++++++++---- clang/test/Analysis/out-of-bounds.c | 8 ++++ 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h index d7cff49036cb8..a560f274c43cc 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h @@ -110,12 +110,16 @@ class SValBuilder { /// that value is returned. Otherwise, returns NULL. virtual const llvm::APSInt *getKnownValue(ProgramStateRef state, SVal val) = 0; - /// Tries to get the minimal possible (integer) value of a given SVal. If the - /// constraint manager cannot provide an useful answer, this returns NULL. + /// Tries to get the minimal possible (integer) value of a given SVal. This + /// always returns the value of a ConcreteInt, but may return NULL if the + /// value is symbolic and the constraint manager cannot provide a useful + /// answer. virtual const llvm::APSInt *getMinValue(ProgramStateRef state, SVal val) = 0; - /// Tries to get the maximal possible (integer) value of a given SVal. If the - /// constraint manager cannot provide an useful answer, this returns NULL. + /// Tries to get the maximal possible (integer) value of a given SVal. 
This + /// always returns the value of a ConcreteInt, but may return NULL if the + /// value is symbolic and the constraint manager cannot provide a useful + /// answer. virtual const llvm::APSInt *getMaxValue(ProgramStateRef state, SVal val) = 0; /// Simplify symbolic expressions within a given SVal. Return an SVal diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp index 05fc00a990d52..fdcc46e58580b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp @@ -268,6 +268,16 @@ getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, return std::pair(offset, extent); } +static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) { + const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value); + return MaxV && MaxV->isNegative(); +} + +static bool isUnsigned(SValBuilder &SVB, NonLoc Value) { + QualType T = Value.getType(SVB.getContext()); + return T->isUnsignedIntegerType(); +} + // Evaluate the comparison Value < Threshold with the help of the custom // simplification algorithm defined for this checker. Return a pair of states, // where the first one corresponds to "value below threshold" and the second @@ -281,18 +291,32 @@ compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, if (auto ConcreteThreshold = Threshold.getAs()) { std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB); } - if (auto ConcreteThreshold = Threshold.getAs()) { - QualType T = Value.getType(SVB.getContext()); - if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) { - // In this case we reduced the bound check to a comparison of the form - // (symbol or value with unsigned type) < (negative number) - // which is always false. 
We are handling these cases separately because - // evalBinOpNN can perform a signed->unsigned conversion that turns the - // negative number into a huge positive value and leads to wildly - // inaccurate conclusions. + + // We want to perform a _mathematical_ comparison between the numbers `Value` + // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may + // perform automatic conversions. For example the number -1 is less than the + // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int` + // -1 is converted to ULONGLONG_MAX. + // To avoid automatic conversions, we evaluate the "obvious" cases without + // calling `evalBinOpNN`: + if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) { + if (CheckEquality) { + // negative_value == unsigned_value is always false return {nullptr, State}; } + // negative_value < unsigned_value is always false + return {State, nullptr}; } + if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) { + // unsigned_value == negative_value and unsigned_value < negative_value are + // both always false + return {nullptr, State}; + } + // FIXME: these special cases are sufficient for handling real-world + // comparisons, but in theory there could be contrived situations where + // automatic conversion of a symbolic value (which can be negative and can be + // positive) leads to incorrect results. + const BinaryOperatorKind OpKind = CheckEquality ? 
BO_EQ : BO_LT; auto BelowThreshold = SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType()) diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c index ed457e8696006..1f771c2b3bd13 100644 --- a/clang/test/Analysis/out-of-bounds.c +++ b/clang/test/Analysis/out-of-bounds.c @@ -186,3 +186,11 @@ void test_assume_after_access2(unsigned long x) { clang_analyzer_eval(x <= 99); // expected-warning{{TRUE}} } +struct incomplete; +char test_comparison_with_extent_symbol(struct incomplete *p) { + // Previously this was reported as a (false positive) overflow error because + // the extent symbol of the area pointed by `p` was an unsigned and the '-1' + // was converted to its type by `evalBinOpNN`. + return ((char *)p)[-1]; // no-warning +} + From afa8a2eed0c4ca61ac19abd88022e63e58408af1 Mon Sep 17 00:00:00 2001 From: NagyDonat Date: Thu, 22 Feb 2024 14:29:05 +0100 Subject: [PATCH 214/351] [analyzer] Remove superfluous #include "CallDescription.h" (NFC) (#82614) To fix https://github.com/llvm/llvm-project/issues/81597, I'm planning to refactor the usage of CallDescription; and as I was preparing for this I noticed that there are two superfluous references to this header. 
--- clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp | 2 +- clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp index 265185e641072..18e718e085536 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp @@ -17,7 +17,7 @@ #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" #include "clang/StaticAnalyzer/Core/Checker.h" #include "clang/StaticAnalyzer/Core/CheckerManager.h" -#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h" diff --git a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h index 6de33da107a3f..dec461296fed5 100644 --- a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h +++ b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h @@ -13,7 +13,6 @@ #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" #include "clang/StaticAnalyzer/Core/Checker.h" #include "clang/StaticAnalyzer/Core/CheckerManager.h" -#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "llvm/ADT/FoldingSet.h" @@ -96,4 +95,4 @@ void handleConstructorAndAssignment(const CallEvent &Call, CheckerContext &C, } // namespace clang::ento::tagged_union_modeling -#endif // LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_TAGGEDUNIONMODELING_H \ No newline at end of file +#endif // LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_TAGGEDUNIONMODELING_H From 
770fd3856660fea6cbaa78d9cb1f03cc92611783 Mon Sep 17 00:00:00 2001 From: Ian Hickson Date: Thu, 22 Feb 2024 05:35:23 -0800 Subject: [PATCH 215/351] [LangRef] Document string literals in LLVM's format (#82529) --- llvm/docs/LangRef.rst | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index fd2e3aacd0169..8f4495e25d0fa 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -61,10 +61,13 @@ run by the parser after parsing input assembly and by the optimizer before it outputs bitcode. The violations pointed out by the verifier pass indicate bugs in transformation passes or input to the parser. +Syntax +====== + .. _identifiers: Identifiers -=========== +----------- LLVM identifiers come in two basic types: global and local. Global identifiers (functions, global variables) begin with the ``'@'`` @@ -140,6 +143,34 @@ It also shows a convention that we follow in this document. When demonstrating instructions, we will follow an instruction with a comment that defines the type and name of value produced. +.. _strings: + +String constants +---------------- + +Strings in LLVM programs are delimited by ``"`` characters. Within a +string, all bytes are treated literally with the exception of ``\`` +characters, which start escapes, and the first ``"`` character, which +ends the string. + +There are two kinds of escapes. + +* ``\\`` represents a single ``\`` character. + +* ``\`` followed by two hexadecimal characters (0-9, a-f, or A-F) + represents the byte with the given value (e.g. \x00 represents a + null byte). + +To represent a ``"`` character, use ``\22``. (``\"`` will end the string +with a trailing ``\``.) + +Newlines do not terminate string constants; strings can span multiple +lines. + +The interpretation of string constants (e.g. their character encoding) +depends on context. 
+ + High Level Structure ==================== From 5b8e5604c297aa8fd09bf641d12d0a663e0ea801 Mon Sep 17 00:00:00 2001 From: zhijian lin Date: Thu, 22 Feb 2024 08:46:08 -0500 Subject: [PATCH 216/351] [AIX] Lower intrinsic __builtin_cpu_is into AIX platform-specific code. (#80069) On AIX OS, __builtin_cpu_is() references the runtime external variable _system_configuration from /usr/include/sys/systemcfg.h. ref issue: https://github.com/llvm/llvm-project/issues/80042 --- .../clang/Basic/DiagnosticSemaKinds.td | 2 + clang/lib/Basic/Targets/PPC.cpp | 10 +++ clang/lib/Basic/Targets/PPC.h | 10 ++- clang/lib/CodeGen/CGBuiltin.cpp | 47 ++++++++++++ clang/lib/Sema/SemaChecking.cpp | 5 +- clang/test/CodeGen/aix-builtin-cpu-is.c | 71 +++++++++++++++++++ clang/test/Sema/aix-builtin-cpu-unsupports.c | 6 ++ .../llvm/TargetParser/PPCTargetParser.def | 57 +++++++++++++++ 8 files changed, 206 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/aix-builtin-cpu-is.c create mode 100644 clang/test/Sema/aix-builtin-cpu-unsupports.c diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 11411883e1bfc..a96f69d6ac760 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10364,6 +10364,8 @@ def err_x86_builtin_tile_arg_duplicate : Error< def err_builtin_target_unsupported : Error< "builtin is not supported on this target">; +def err_builtin_aix_os_unsupported : Error< + "this builtin is available only on AIX 7.2 and later operating systems">; def err_builtin_longjmp_unsupported : Error< "__builtin_longjmp is not supported for the current target">; def err_builtin_setjmp_unsupported : Error< diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 8c891ccdeb59d..aebe51bfa4daa 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -904,6 +904,16 @@ bool 
PPCTargetInfo::validateCpuSupports(StringRef FeatureStr) const { } bool PPCTargetInfo::validateCpuIs(StringRef CPUName) const { + llvm::Triple Triple = getTriple(); + if (Triple.isOSAIX()) { +#define PPC_AIX_CPU(NAME, SUPPORT, INDEX, OP, VALUE) .Case(NAME, true) + return llvm::StringSwitch(CPUName) +#include "llvm/TargetParser/PPCTargetParser.def" + .Default(false); + } + + assert(Triple.isOSLinux() && + "__builtin_cpu_is() is only supported for AIX and Linux."); #define PPC_LNX_CPU(NAME, NUM) .Case(NAME, true) return llvm::StringSwitch(CPUName) #include "llvm/TargetParser/PPCTargetParser.def" diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index a91bdede53e40..70683916a8b04 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -362,8 +362,16 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { // We support __builtin_cpu_supports/__builtin_cpu_is on targets that // have Glibc since it is Glibc that provides the HWCAP[2] in the auxv. + static constexpr int MINIMUM_AIX_OS_MAJOR = 7; + static constexpr int MINIMUM_AIX_OS_MINOR = 2; bool supportsCpuSupports() const override { return getTriple().isOSGlibc(); } - bool supportsCpuIs() const override { return getTriple().isOSGlibc(); } + bool supportsCpuIs() const override { + llvm::Triple Triple = getTriple(); + // AIX 7.2 is the minimum requirement to support __builtin_cpu_is(). 
+ return Triple.isOSGlibc() || + (Triple.isOSAIX() && + !Triple.isOSVersionLT(MINIMUM_AIX_OS_MAJOR, MINIMUM_AIX_OS_MINOR)); + } bool validateCpuSupports(StringRef Feature) const override; bool validateCpuIs(StringRef Name) const override; }; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d454ccc1dd861..d8b2115f1e5e3 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16542,12 +16542,59 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, Intrinsic::ID ID = Intrinsic::not_intrinsic; +#include "llvm/TargetParser/PPCTargetParser.def" + auto GenAIXPPCBuiltinCpuExpr = [&](unsigned SupportMethod, unsigned FieldIdx, + unsigned CompOp, + unsigned OpValue) -> Value * { + if (SupportMethod == AIX_BUILTIN_PPC_FALSE) + return llvm::ConstantInt::getFalse(ConvertType(E->getType())); + + if (SupportMethod == AIX_BUILTIN_PPC_TRUE) + return llvm::ConstantInt::getTrue(ConvertType(E->getType())); + + assert(SupportMethod <= USE_SYS_CONF && "Invalid value for SupportMethod."); + assert((CompOp == COMP_EQ) && "Only equal comparisons are supported."); + + llvm::Type *STy = llvm::StructType::get(PPC_SYSTEMCONFIG_TYPE); + llvm::Constant *SysConf = + CGM.CreateRuntimeVariable(STy, "_system_configuration"); + + // Grab the appropriate field from _system_configuration. 
+ llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, FieldIdx)}; + + llvm::Value *FieldValue = Builder.CreateGEP(STy, SysConf, Idxs); + FieldValue = Builder.CreateAlignedLoad(Int32Ty, FieldValue, + CharUnits::fromQuantity(4)); + assert(FieldValue->getType()->isIntegerTy(32) && + "Only 32-bit integers are supported in GenAIXPPCBuiltinCpuExpr()."); + return Builder.CreateICmp(ICmpInst::ICMP_EQ, FieldValue, + ConstantInt::get(Int32Ty, OpValue)); + }; + switch (BuiltinID) { default: return nullptr; case Builtin::BI__builtin_cpu_is: { const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts(); StringRef CPUStr = cast(CPUExpr)->getString(); + llvm::Triple Triple = getTarget().getTriple(); + + if (Triple.isOSAIX()) { + unsigned IsCpuSupport, FieldIdx, CompareOp, CpuIdValue; + typedef std::tuple CPUType; + std::tie(IsCpuSupport, FieldIdx, CompareOp, CpuIdValue) = + static_cast(StringSwitch(CPUStr) +#define PPC_AIX_CPU(NAME, SUPPORT_MAGIC, INDEX, COMPARE_OP, VALUE) \ + .Case(NAME, {SUPPORT_MAGIC, INDEX, COMPARE_OP, VALUE}) +#include "llvm/TargetParser/PPCTargetParser.def" + ); + return GenAIXPPCBuiltinCpuExpr(IsCpuSupport, FieldIdx, CompareOp, + CpuIdValue); + } + + assert(Triple.isOSLinux() && + "__builtin_cpu_is() is only supported for AIX and Linux."); unsigned NumCPUID = StringSwitch(CPUStr) #define PPC_LNX_CPU(Name, NumericID) .Case(Name, NumericID) #include "llvm/TargetParser/PPCTargetParser.def" diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index e8bfb215a5b4c..710437b354521 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2165,7 +2165,10 @@ static bool SemaBuiltinCpu(Sema &S, const TargetInfo &TI, CallExpr *TheCall, return S.Diag(TheCall->getBeginLoc(), diag::err_builtin_target_unsupported) << SourceRange(TheCall->getBeginLoc(), TheCall->getEndLoc()); if (!IsCPUSupports && !TheTI->supportsCpuIs()) - return S.Diag(TheCall->getBeginLoc(), 
diag::err_builtin_target_unsupported) + return S.Diag(TheCall->getBeginLoc(), + TI.getTriple().isOSAIX() + ? diag::err_builtin_aix_os_unsupported + : diag::err_builtin_target_unsupported) << SourceRange(TheCall->getBeginLoc(), TheCall->getEndLoc()); Expr *Arg = TheCall->getArg(0)->IgnoreParenImpCasts(); diff --git a/clang/test/CodeGen/aix-builtin-cpu-is.c b/clang/test/CodeGen/aix-builtin-cpu-is.c new file mode 100644 index 0000000000000..b0a0dec41b56c --- /dev/null +++ b/clang/test/CodeGen/aix-builtin-cpu-is.c @@ -0,0 +1,71 @@ +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc970\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc-cell-be\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppca2\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc405\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc440\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc464\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc476\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power4\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power5\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck 
%s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power5+\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power6\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power6x\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power7\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=32768 \ +// RUN: --check-prefix=CHECKOP + +// RUN: echo "int main() { return __builtin_cpu_is(\"power8\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=65536 \ +// RUN: --check-prefix=CHECKOP + +// RUN: echo "int main() { return __builtin_cpu_is(\"power9\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=131072\ +// RUN: --check-prefix=CHECKOP + +// RUN: echo "int main() { return __builtin_cpu_is(\"power10\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=262144 \ +// RUN: --check-prefix=CHECKOP + +// CHECK: define i32 @main() #0 { +// CHECK-NEXT: entry: +// CHECK-NEXT: %retval = alloca i32, align 4 +// CHECK-NEXT: store i32 0, ptr %retval, align 4 +// CHECK-NEXT: ret i32 0 +// CHECK-NEXT: } + +// CHECKOP: @_system_configuration = external global { i32, i32, i32 } +// CHECKOP: define i32 @main() #0 { +// CHECKOP-NEXT: entry: +// CHECKOP-NEXT: %retval = alloca i32, align 4 +// CHECKOP-NEXT: store i32 0, ptr %retval, align 4 +// CHECKOP-NEXT: %0 = load i32, ptr getelementptr inbounds ({ i32, i32, i32 }, ptr @_system_configuration, i32 0, i32 1), align 4 +// CHECKOP-NEXT: %1 = icmp eq i32 %0, [[VALUE]] +// CHECKOP-NEXT: %conv = zext i1 %1 to i32 
+// CHECKOP-NEXT: ret i32 %conv +// CHECKOP-NEXT: } + + diff --git a/clang/test/Sema/aix-builtin-cpu-unsupports.c b/clang/test/Sema/aix-builtin-cpu-unsupports.c new file mode 100644 index 0000000000000..10e21867c3937 --- /dev/null +++ b/clang/test/Sema/aix-builtin-cpu-unsupports.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -fsyntax-only -triple powerpc-ibm-aix7.1.0.0 -verify %s + +int main(void) { + if (__builtin_cpu_is("power8")) // expected-error {{this builtin is available only on AIX 7.2 and later operating systems}} + return 1; +} diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.def b/llvm/include/llvm/TargetParser/PPCTargetParser.def index f2c44b46fa673..88c7304659c4d 100644 --- a/llvm/include/llvm/TargetParser/PPCTargetParser.def +++ b/llvm/include/llvm/TargetParser/PPCTargetParser.def @@ -126,4 +126,61 @@ PPC_LNX_CPU("power10",47) #undef PPC_LNX_DEFINE_OFFSETS #undef PPC_LNX_FEATURE #undef PPC_LNX_CPU + +// Definition of the following values are found in the AIX header +// file: . +#ifndef AIX_POWERPC_USE_SYS_CONF + #define AIX_POWERPC_USE_SYS_CONF + #define AIX_SYSCON_IMPL_IDX 1 + #define AIX_PPC7_VALUE 0x00008000 + #define AIX_PPC8_VALUE 0x00010000 + #define AIX_PPC9_VALUE 0x00020000 + #define AIX_PPC10_VALUE 0x00040000 + + // Supported SUPPORT_METHOD values. + #define AIX_BUILTIN_PPC_TRUE 1 + #define AIX_BUILTIN_PPC_FALSE 0 + #define USE_SYS_CONF 2 + + // Supported COMPARE_OP values. + #define COMP_EQ 0 + +#endif + +// The value of SUPPORT_METHOD can be AIX_BUILTIN_PPC_TRUE, +// AIX_BUILTIN_PPC_FALSE, or USE_SYS_CONF. +// When the value of SUPPORT_METHOD is USE_SYS_CONF, the return value +// depends on the result of comparing the data member of +// _system_configuration specified by INDEX with a certain value. + +#ifndef PPC_AIX_CPU + #define PPC_AIX_CPU(NAME, SUPPORT_METHOD, INDEX, COMPARE_OP, VALUE) +#endif + +// __builtin_cpu_is() is supported only on Power7 and up. 
+PPC_AIX_CPU("power4",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc970",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power5",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power5+",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power6",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc-cell-be",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power6x",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppca2",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc405",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc440",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc464",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc476",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power7",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC7_VALUE) +PPC_AIX_CPU("power8",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC8_VALUE) +PPC_AIX_CPU("power9",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC9_VALUE) +PPC_AIX_CPU("power10",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC10_VALUE) +#undef PPC_AIX_CPU + +// PPC_SYSTEMCONFIG_TYPE defines the IR data structure of kernel variable +// `_system_configuration`, that is found in the AIX OS header file: . +#ifndef PPC_SYSTEMCONFIG_TYPE +#define PPC_SYSTEMCONFIG_TYPE \ +Int32Ty, Int32Ty, Int32Ty +#endif + #endif // !PPC_TGT_PARSER_UNDEF_MACROS From cbb24e139d0753d755d17fbe6bfac48ab44d0721 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 22 Feb 2024 14:07:16 +0000 Subject: [PATCH 217/351] [LLVM][IR] Add native vector support to ConstantInt & ConstantFP. (#74502) NOTE: For brevity the following talks about ConstantInt but everything extends to cover ConstantFP as well. Whilst ConstantInt::get() supports the creation of vectors whereby each lane has the same value, it achieves this via other constants: * ConstantVector for fixed-length vectors * ConstantExprs for scalable vectors However, ConstantExprs are being deprecated and ConstantVector is not space efficient for larger vector types. 
By extending ConstantInt we can represent vector splats by only storing the underlying scalar value. More specifically: * ConstantInt gains an ElementCount variant of get(). * LLVMContext is extended to map ->ConstantInt. * BitcodeReader/Writer support is extended to allow vector types. Whilst this patch adds the base support, more work is required before it's production ready. For example, there's likely to be many places where isa assumes a scalar type. Accordingly the default behaviour of ConstantInt::get() remains unchanged but a set of flags are added to allow wider testing and thus help with the migration: --use-constant-int-for-fixed-length-splat --use-constant-fp-for-fixed-length-splat --use-constant-int-for-scalable-splat --use-constant-fp-for-scalable-splat NOTE: No change is required to the bitcode format because types and values are handled separately. NOTE: For similar reasons as above, code generation doesn't work out-the-box. --- llvm/include/llvm/IR/Constants.h | 18 ++++- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 55 ++++++------- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 +- llvm/lib/IR/AsmWriter.cpp | 31 +++++++- llvm/lib/IR/Constants.cpp | 94 +++++++++++++++++++++-- llvm/lib/IR/LLVMContextImpl.cpp | 2 + llvm/lib/IR/LLVMContextImpl.h | 4 + llvm/test/Bitcode/constant-splat.ll | 76 ++++++++++++++++++ 8 files changed, 243 insertions(+), 39 deletions(-) create mode 100644 llvm/test/Bitcode/constant-splat.ll diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index b5dcc7fbc1d92..c0ac9a4aa6750 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -78,13 +78,20 @@ class ConstantData : public Constant { /// Class for constant integers. 
class ConstantInt final : public ConstantData { friend class Constant; + friend class ConstantVector; APInt Val; - ConstantInt(IntegerType *Ty, const APInt &V); + ConstantInt(Type *Ty, const APInt &V); void destroyConstantImpl(); + /// Return a ConstantInt with the specified value and an implied Type. The + /// type is the vector type whose integer element type corresponds to the bit + /// width of the value. + static ConstantInt *get(LLVMContext &Context, ElementCount EC, + const APInt &V); + public: ConstantInt(const ConstantInt &) = delete; @@ -136,7 +143,7 @@ class ConstantInt final : public ConstantData { /// Return the constant's value. inline const APInt &getValue() const { return Val; } - /// getBitWidth - Return the bitwidth of this constant. + /// getBitWidth - Return the scalar bitwidth of this constant. unsigned getBitWidth() const { return Val.getBitWidth(); } /// Return the constant as a 64-bit unsigned integer value after it @@ -259,6 +266,7 @@ class ConstantInt final : public ConstantData { /// class ConstantFP final : public ConstantData { friend class Constant; + friend class ConstantVector; APFloat Val; @@ -266,6 +274,12 @@ class ConstantFP final : public ConstantData { void destroyConstantImpl(); + /// Return a ConstantFP with the specified value and an implied Type. The + /// type is the vector type whose element type has the same floating point + /// semantics as the value. 
+ static ConstantFP *get(LLVMContext &Context, ElementCount EC, + const APFloat &V); + public: ConstantFP(const ConstantFP &) = delete; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 515a1d0caa041..832907a3f53f5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -3060,48 +3060,49 @@ Error BitcodeReader::parseConstants() { V = Constant::getNullValue(CurTy); break; case bitc::CST_CODE_INTEGER: // INTEGER: [intval] - if (!CurTy->isIntegerTy() || Record.empty()) + if (!CurTy->isIntOrIntVectorTy() || Record.empty()) return error("Invalid integer const record"); V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0])); break; case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval] - if (!CurTy->isIntegerTy() || Record.empty()) + if (!CurTy->isIntOrIntVectorTy() || Record.empty()) return error("Invalid wide integer const record"); - APInt VInt = - readWideAPInt(Record, cast(CurTy)->getBitWidth()); - V = ConstantInt::get(Context, VInt); - + auto *ScalarTy = cast(CurTy->getScalarType()); + APInt VInt = readWideAPInt(Record, ScalarTy->getBitWidth()); + V = ConstantInt::get(CurTy, VInt); break; } case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] if (Record.empty()) return error("Invalid float const record"); - if (CurTy->isHalfTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf(), - APInt(16, (uint16_t)Record[0]))); - else if (CurTy->isBFloatTy()) - V = ConstantFP::get(Context, APFloat(APFloat::BFloat(), - APInt(16, (uint32_t)Record[0]))); - else if (CurTy->isFloatTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle(), - APInt(32, (uint32_t)Record[0]))); - else if (CurTy->isDoubleTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble(), - APInt(64, Record[0]))); - else if (CurTy->isX86_FP80Ty()) { + + auto *ScalarTy = CurTy->getScalarType(); + if (ScalarTy->isHalfTy()) + V = ConstantFP::get(CurTy, 
APFloat(APFloat::IEEEhalf(), + APInt(16, (uint16_t)Record[0]))); + else if (ScalarTy->isBFloatTy()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::BFloat(), APInt(16, (uint32_t)Record[0]))); + else if (ScalarTy->isFloatTy()) + V = ConstantFP::get(CurTy, APFloat(APFloat::IEEEsingle(), + APInt(32, (uint32_t)Record[0]))); + else if (ScalarTy->isDoubleTy()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::IEEEdouble(), APInt(64, Record[0]))); + else if (ScalarTy->isX86_FP80Ty()) { // Bits are not stored the same way as a normal i80 APInt, compensate. uint64_t Rearrange[2]; Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16); Rearrange[1] = Record[0] >> 48; - V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended(), - APInt(80, Rearrange))); - } else if (CurTy->isFP128Ty()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad(), - APInt(128, Record))); - else if (CurTy->isPPC_FP128Ty()) - V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble(), - APInt(128, Record))); + V = ConstantFP::get( + CurTy, APFloat(APFloat::x87DoubleExtended(), APInt(80, Rearrange))); + } else if (ScalarTy->isFP128Ty()) + V = ConstantFP::get(CurTy, + APFloat(APFloat::IEEEquad(), APInt(128, Record))); + else if (ScalarTy->isPPC_FP128Ty()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::PPCDoubleDouble(), APInt(128, Record))); else V = UndefValue::get(CurTy); break; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 13be0b0c3307f..656f2a6ce870f 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2624,7 +2624,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, } } else if (const ConstantFP *CFP = dyn_cast(C)) { Code = bitc::CST_CODE_FLOAT; - Type *Ty = CFP->getType(); + Type *Ty = CFP->getType()->getScalarType(); if (Ty->isHalfTy() || Ty->isBFloatTy() || Ty->isFloatTy() || Ty->isDoubleTy()) { 
Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 251485a403fee..ac0f119b00bde 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1505,16 +1505,39 @@ static void WriteAPFloatInternal(raw_ostream &Out, const APFloat &APF) { static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, AsmWriterContext &WriterCtx) { if (const ConstantInt *CI = dyn_cast(CV)) { - if (CI->getType()->isIntegerTy(1)) { - Out << (CI->getZExtValue() ? "true" : "false"); - return; + Type *Ty = CI->getType(); + + if (Ty->isVectorTy()) { + Out << "splat ("; + WriterCtx.TypePrinter->print(Ty->getScalarType(), Out); + Out << " "; } - Out << CI->getValue(); + + if (Ty->getScalarType()->isIntegerTy(1)) + Out << (CI->getZExtValue() ? "true" : "false"); + else + Out << CI->getValue(); + + if (Ty->isVectorTy()) + Out << ")"; + return; } if (const ConstantFP *CFP = dyn_cast(CV)) { + Type *Ty = CFP->getType(); + + if (Ty->isVectorTy()) { + Out << "splat ("; + WriterCtx.TypePrinter->print(Ty->getScalarType(), Out); + Out << " "; + } + WriteAPFloatInternal(Out, CFP->getValueAPF()); + + if (Ty->isVectorTy()) + Out << ")"; + return; } diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a38b912164b13..e6b92aad392f6 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -35,6 +35,20 @@ using namespace llvm; using namespace PatternMatch; +// As set of temporary options to help migrate how splats are represented. 
+static cl::opt UseConstantIntForFixedLengthSplat( + "use-constant-int-for-fixed-length-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantInt's native fixed-length vector splat support.")); +static cl::opt UseConstantFPForFixedLengthSplat( + "use-constant-fp-for-fixed-length-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantFP's native fixed-length vector splat support.")); +static cl::opt UseConstantIntForScalableSplat( + "use-constant-int-for-scalable-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantInt's native scalable vector splat support.")); +static cl::opt UseConstantFPForScalableSplat( + "use-constant-fp-for-scalable-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantFP's native scalable vector splat support.")); + //===----------------------------------------------------------------------===// // Constant Class //===----------------------------------------------------------------------===// @@ -825,9 +839,11 @@ bool Constant::isManifestConstant() const { // ConstantInt //===----------------------------------------------------------------------===// -ConstantInt::ConstantInt(IntegerType *Ty, const APInt &V) +ConstantInt::ConstantInt(Type *Ty, const APInt &V) : ConstantData(Ty, ConstantIntVal), Val(V) { - assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type"); + assert(V.getBitWidth() == + cast(Ty->getScalarType())->getBitWidth() && + "Invalid constant for type"); } ConstantInt *ConstantInt::getTrue(LLVMContext &Context) { @@ -885,6 +901,26 @@ ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) { return Slot.get(); } +// Get a ConstantInt vector with each lane set to the same APInt. +ConstantInt *ConstantInt::get(LLVMContext &Context, ElementCount EC, + const APInt &V) { + // Get an existing value or the insertion position. 
+ std::unique_ptr &Slot = + Context.pImpl->IntSplatConstants[std::make_pair(EC, V)]; + if (!Slot) { + IntegerType *ITy = IntegerType::get(Context, V.getBitWidth()); + VectorType *VTy = VectorType::get(ITy, EC); + Slot.reset(new ConstantInt(VTy, V)); + } + +#ifndef NDEBUG + IntegerType *ITy = IntegerType::get(Context, V.getBitWidth()); + VectorType *VTy = VectorType::get(ITy, EC); + assert(Slot->getType() == VTy); +#endif + return Slot.get(); +} + Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) { Constant *C = get(cast(Ty->getScalarType()), V, isSigned); @@ -1024,6 +1060,26 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) { return Slot.get(); } +// Get a ConstantFP vector with each lane set to the same APFloat. +ConstantFP *ConstantFP::get(LLVMContext &Context, ElementCount EC, + const APFloat &V) { + // Get an existing value or the insertion position. + std::unique_ptr &Slot = + Context.pImpl->FPSplatConstants[std::make_pair(EC, V)]; + if (!Slot) { + Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics()); + VectorType *VTy = VectorType::get(EltTy, EC); + Slot.reset(new ConstantFP(VTy, V)); + } + +#ifndef NDEBUG + Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics()); + VectorType *VTy = VectorType::get(EltTy, EC); + assert(Slot->getType() == VTy); +#endif + return Slot.get(); +} + Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { const fltSemantics &Semantics = Ty->getScalarType()->getFltSemantics(); Constant *C = get(Ty->getContext(), APFloat::getInf(Semantics, Negative)); @@ -1036,7 +1092,7 @@ Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { ConstantFP::ConstantFP(Type *Ty, const APFloat &V) : ConstantData(Ty, ConstantFPVal), Val(V) { - assert(&V.getSemantics() == &Ty->getFltSemantics() && + assert(&V.getSemantics() == &Ty->getScalarType()->getFltSemantics() && "FP type Mismatch"); } @@ -1356,11 +1412,13 @@ Constant *ConstantVector::getImpl(ArrayRef V) { bool isZero = 
C->isNullValue(); bool isUndef = isa(C); bool isPoison = isa(C); + bool isSplatFP = UseConstantFPForFixedLengthSplat && isa(C); + bool isSplatInt = UseConstantIntForFixedLengthSplat && isa(C); - if (isZero || isUndef) { + if (isZero || isUndef || isSplatFP || isSplatInt) { for (unsigned i = 1, e = V.size(); i != e; ++i) if (V[i] != C) { - isZero = isUndef = isPoison = false; + isZero = isUndef = isPoison = isSplatFP = isSplatInt = false; break; } } @@ -1371,6 +1429,12 @@ Constant *ConstantVector::getImpl(ArrayRef V) { return PoisonValue::get(T); if (isUndef) return UndefValue::get(T); + if (isSplatFP) + return ConstantFP::get(C->getContext(), T->getElementCount(), + cast(C)->getValue()); + if (isSplatInt) + return ConstantInt::get(C->getContext(), T->getElementCount(), + cast(C)->getValue()); // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. @@ -1384,6 +1448,16 @@ Constant *ConstantVector::getImpl(ArrayRef V) { Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { if (!EC.isScalable()) { + // Maintain special handling of zero. + if (!V->isNullValue()) { + if (UseConstantIntForFixedLengthSplat && isa(V)) + return ConstantInt::get(V->getContext(), EC, + cast(V)->getValue()); + if (UseConstantFPForFixedLengthSplat && isa(V)) + return ConstantFP::get(V->getContext(), EC, + cast(V)->getValue()); + } + // If this splat is compatible with ConstantDataVector, use it instead of // ConstantVector. if ((isa(V) || isa(V)) && @@ -1394,6 +1468,16 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { return get(Elts); } + // Maintain special handling of zero. 
+ if (!V->isNullValue()) { + if (UseConstantIntForScalableSplat && isa(V)) + return ConstantInt::get(V->getContext(), EC, + cast(V)->getValue()); + if (UseConstantFPForScalableSplat && isa(V)) + return ConstantFP::get(V->getContext(), EC, + cast(V)->getValue()); + } + Type *VTy = VectorType::get(V->getType(), EC); if (V->isNullValue()) diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 15c90a4fe7b2e..a0bf9cae7926b 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -119,7 +119,9 @@ LLVMContextImpl::~LLVMContextImpl() { IntZeroConstants.clear(); IntOneConstants.clear(); IntConstants.clear(); + IntSplatConstants.clear(); FPConstants.clear(); + FPSplatConstants.clear(); CDSConstants.clear(); // Destroy attribute node lists. diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 6a20291344989..2ee1080a1ffa2 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1488,8 +1488,12 @@ class LLVMContextImpl { DenseMap> IntZeroConstants; DenseMap> IntOneConstants; DenseMap> IntConstants; + DenseMap, std::unique_ptr> + IntSplatConstants; DenseMap> FPConstants; + DenseMap, std::unique_ptr> + FPSplatConstants; FoldingSet AttrsSet; FoldingSet AttrsLists; diff --git a/llvm/test/Bitcode/constant-splat.ll b/llvm/test/Bitcode/constant-splat.ll new file mode 100644 index 0000000000000..2bcc3ddf3e4f3 --- /dev/null +++ b/llvm/test/Bitcode/constant-splat.ll @@ -0,0 +1,76 @@ +; RUN: llvm-as -use-constant-int-for-fixed-length-splat \ +; RUN: -use-constant-fp-for-fixed-length-splat \ +; RUN: -use-constant-int-for-scalable-splat \ +; RUN: -use-constant-fp-for-scalable-splat \ +; RUN: < %s | llvm-dis -use-constant-int-for-fixed-length-splat \ +; RUN: -use-constant-fp-for-fixed-length-splat \ +; RUN: -use-constant-int-for-scalable-splat \ +; RUN: -use-constant-fp-for-scalable-splat \ +; RUN: | FileCheck %s + +; CHECK: @constant.splat.i1 = constant <1 x i1> splat (i1 true) 
+@constant.splat.i1 = constant <1 x i1> splat (i1 true) + +; CHECK: @constant.splat.i32 = constant <5 x i32> splat (i32 7) +@constant.splat.i32 = constant <5 x i32> splat (i32 7) + +; CHECK: @constant.splat.i128 = constant <7 x i128> splat (i128 85070591730234615870450834276742070272) +@constant.splat.i128 = constant <7 x i128> splat (i128 85070591730234615870450834276742070272) + +; CHECK: @constant.splat.f16 = constant <2 x half> splat (half 0xHBC00) +@constant.splat.f16 = constant <2 x half> splat (half 0xHBC00) + +; CHECK: @constant.splat.f32 = constant <4 x float> splat (float -2.000000e+00) +@constant.splat.f32 = constant <4 x float> splat (float -2.000000e+00) + +; CHECK: @constant.splat.f64 = constant <6 x double> splat (double -3.000000e+00) +@constant.splat.f64 = constant <6 x double> splat (double -3.000000e+00) + +; CHECK: @constant.splat.128 = constant <8 x fp128> splat (fp128 0xL00000000000000018000000000000000) +@constant.splat.128 = constant <8 x fp128> splat (fp128 0xL00000000000000018000000000000000) + +; CHECK: @constant.splat.bf16 = constant <1 x bfloat> splat (bfloat 0xRC0A0) +@constant.splat.bf16 = constant <1 x bfloat> splat (bfloat 0xRC0A0) + +; CHECK: @constant.splat.x86_fp80 = constant <3 x x86_fp80> splat (x86_fp80 0xK4000C8F5C28F5C28F800) +@constant.splat.x86_fp80 = constant <3 x x86_fp80> splat (x86_fp80 0xK4000C8F5C28F5C28F800) + +; CHECK: @constant.splat.ppc_fp128 = constant <7 x ppc_fp128> splat (ppc_fp128 0xM80000000000000000000000000000000) +@constant.splat.ppc_fp128 = constant <7 x ppc_fp128> splat (ppc_fp128 0xM80000000000000000000000000000000) + +define void @add_fixed_lenth_vector_splat_i32(<4 x i32> %a) { +; CHECK: %add = add <4 x i32> %a, splat (i32 137) + %add = add <4 x i32> %a, splat (i32 137) + ret void +} + +define <4 x i32> @ret_fixed_lenth_vector_splat_i32() { +; CHECK: ret <4 x i32> splat (i32 56) + ret <4 x i32> splat (i32 56) +} + +define void @add_fixed_lenth_vector_splat_double( %a) { +; CHECK: %add = fadd %a, 
splat (double 5.700000e+00) + %add = fadd %a, splat (double 5.700000e+00) + ret void +} + +define @ret_scalable_vector_splat_i32() { +; CHECK: ret splat (i32 78) + ret splat (i32 78) +} + +define <4 x i32> @canonical_constant_vector() { +; CHECK: ret <4 x i32> splat (i32 7) + ret <4 x i32> +} + +define <4 x i32> @canonical_fixed_lnegth_vector_zero() { +; CHECK: ret <4 x i32> zeroinitializer + ret <4 x i32> zeroinitializer +} + +define @canonical_scalable_lnegth_vector_zero() { +; CHECK: ret zeroinitializer + ret zeroinitializer +} From 88e31f64a034ec6dead2106016ee5b797674edb0 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 22 Feb 2024 08:13:41 -0600 Subject: [PATCH 218/351] [OpenMP][FIX] Remove unsound omp_get_thread_limit deduplication (#79524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deduplication of the calls to `omp_get_thread_limit` used to be legal when originally added in , as the result (thread_limit) was immutable. However, now that we have `thread_limit` clause, we no longer have immutability; therefore `omp_get_thread_limit()` is not a deduplicable runtime call. Thus, removing `omp_get_thread_limit` from the `DeduplicableRuntimeCallIDs` array. Here's a simple example: ``` #include #include int main() { #pragma omp target thread_limit(4) { printf("\n1:target thread_limit: %d\n", omp_get_thread_limit()); } #pragma omp target thread_limit(3) { printf("\n2:target thread_limit: %d\n", omp_get_thread_limit()); } return 0; } ``` GCC-compiled binary execution: https://gcc.godbolt.org/z/Pjv3TWoTq ``` 1:target thread_limit: 4 2:target thread_limit: 3 ``` Clang/LLVM-compiled binary execution: https://clang.godbolt.org/z/zdPbrdMPn ``` 1:target thread_limit: 4 2:target thread_limit: 4 ``` By my reading of the OpenMP spec GCC does the right thing here; cf. 
: > If a target construct with a thread_limit clause is encountered, the thread-limit-var ICV from the data environment of the generated initial task is instead set to an implementation defined value between one and the value specified in the clause. The common subexpression elimination (CSE) of the second call to `omp_get_thread_limit` by LLVM does not seem to be correct, as it's not an available expression at any program point(s) (in the scope of the clause in question) after the second target construct with a `thread_limit` clause is encountered. Compiling with `-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt` we have: https://clang.godbolt.org/z/G7dfhP7jh ``` :8:42: remark: OpenMP runtime call omp_get_thread_limit deduplicated. [OMP170] [-Rpass=openmp-opt] 8 | printf("\n1:target thread_limit: %d\n",omp_get_thread_limit()); | ^ ``` OMP170 has the following explanation: https://openmp.llvm.org/remarks/OMP170.html > This optimization remark indicates that a call to an OpenMP runtime call was replaced with the result of an existing one. This occurs when the compiler knows that the result of a runtime call is immutable. Removing duplicate calls is done by replacing all calls to that function with the result of the first call. This cannot be done automatically by the compiler because the implementations of the OpenMP runtime calls live in a separate library the compiler cannot see. This optimization will trigger for known OpenMP runtime calls whose return value will not change. At the same time I do not believe we have an analysis checking whether this precondition holds here: "This occurs when the compiler knows that the result of a runtime call is immutable." 
AFAICT, such analysis doesn't appear to exist in the original patch introducing deduplication, either: - https://github.com/llvm/llvm-project/commit/9548b74a831ea005649465797f359e0521f3b8a9 - https://reviews.llvm.org/D69930 The fix is to remove it from `DeduplicableRuntimeCallIDs`, effectively reverting the addition in this commit (noting that `omp_get_max_threads` is not present in `DeduplicableRuntimeCallIDs`, so it's possible this addition was incorrect in the first place): - [OpenMP][Opt] Annotate known runtime functions and deduplicate more, - https://github.com/llvm/llvm-project/commit/e28936f6137c5a9c4f7673e248c192a9811543b6#diff-de101c82aff66b2bda2d1f53fde3dde7b0d370f14f1ff37b7919ce38531230dfR123 As a result, we're no longer unsoundly deduplicating the OpenMP runtime call `omp_get_thread_limit` as illustrated by the test case: Note the (correctly) repeated `call i32 @omp_get_thread_limit()`. --------- Co-authored-by: Joseph Huber --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 1 - .../OpenMP/deduplication_soundness.ll | 59 +++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/OpenMP/deduplication_soundness.ll diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 4176d561363fb..77ca36d64029f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1471,7 +1471,6 @@ struct OpenMPOpt { OMPRTL_omp_get_num_threads, OMPRTL_omp_in_parallel, OMPRTL_omp_get_cancellation, - OMPRTL_omp_get_thread_limit, OMPRTL_omp_get_supported_active_levels, OMPRTL_omp_get_level, OMPRTL_omp_get_ancestor_thread_num, diff --git a/llvm/test/Transforms/OpenMP/deduplication_soundness.ll b/llvm/test/Transforms/OpenMP/deduplication_soundness.ll new file mode 100644 index 0000000000000..9dd3219175fea --- /dev/null +++ b/llvm/test/Transforms/OpenMP/deduplication_soundness.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --function main --scrub-attributes --filter "@omp_get_thread_limit|@use" --version 4 +; RUN: opt -passes=openmp-opt-cgscc -S < %s | FileCheck %s + +declare void @use(i32 noundef) +declare i32 @omp_get_thread_limit() +declare void @__kmpc_set_thread_limit(ptr, i32, i32) +declare i32 @__kmpc_global_thread_num(ptr) +declare noalias ptr @__kmpc_omp_task_alloc(ptr, i32, i32, i64, i64, ptr) +declare void @__kmpc_omp_task_complete_if0(ptr, i32, ptr) +declare void @__kmpc_omp_task_begin_if0(ptr, i32, ptr) + +%struct.ident_t = type { i32, i32, i32, i32, ptr } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 + +define i32 @main() local_unnamed_addr { +; CHECK-LABEL: define i32 @main() local_unnamed_addr { +; CHECK: [[CALL_I_I_I:%.*]] = call i32 @omp_get_thread_limit() +; CHECK: call void @use(i32 noundef [[CALL_I_I_I]]) +; CHECK: [[CALL_I_I_I2:%.*]] = call i32 @omp_get_thread_limit() +; CHECK: call void @use(i32 noundef [[CALL_I_I_I2]]) +; +entry: + %0 = call i32 @__kmpc_global_thread_num(ptr nonnull @1) + %1 = call ptr @__kmpc_omp_task_alloc(ptr nonnull @1, i32 %0, i32 1, i64 40, i64 0, ptr nonnull @.omp_task_entry.) 
+ call void @__kmpc_omp_task_begin_if0(ptr nonnull @1, i32 %0, ptr %1) + call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 4) + %call.i.i.i = call i32 @omp_get_thread_limit() + call void @use(i32 noundef %call.i.i.i) + call void @__kmpc_omp_task_complete_if0(ptr nonnull @1, i32 %0, ptr %1) + %2 = call ptr @__kmpc_omp_task_alloc(ptr nonnull @1, i32 %0, i32 1, i64 40, i64 0, ptr nonnull @.omp_task_entry..2) + call void @__kmpc_omp_task_begin_if0(ptr nonnull @1, i32 %0, ptr %2) + call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 3) + %call.i.i.i2 = call i32 @omp_get_thread_limit() + call void @use(i32 noundef %call.i.i.i2) + call void @__kmpc_omp_task_complete_if0(ptr nonnull @1, i32 %0, ptr %2) + ret i32 0 +} + +define internal noundef i32 @.omp_task_entry.(i32 noundef %0, ptr noalias nocapture noundef readonly %1) { +entry: + tail call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 4) + %call.i.i = tail call i32 @omp_get_thread_limit() + tail call void @use(i32 noundef %call.i.i) + ret i32 0 +} + +define internal noundef i32 @.omp_task_entry..2(i32 noundef %0, ptr noalias nocapture noundef readonly %1) { +entry: + tail call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 3) + %call.i.i = tail call i32 @omp_get_thread_limit() + tail call void @use(i32 noundef %call.i.i) + ret i32 0 +} + +!llvm.module.flags = !{!0} + +!0 = !{i32 7, !"openmp", i32 51} From d3f6dd6585f4866a38a794b80db55a62c1050c77 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 22 Feb 2024 15:25:17 +0100 Subject: [PATCH 219/351] [InstCombine] Pick bfloat over half when shrinking ops that started with an fpext from bfloat (#82493) This fixes the case where we would shrink an frem to half and then bitcast to bfloat, producing invalid results. The transformation was written under the assumption that there is only one type with a given bit width. Also add a strategic assert to CastInst::CreateFPCast to turn this miscompilation into a crash. 
--- llvm/lib/IR/Instructions.cpp | 1 + .../InstCombine/InstCombineCasts.cpp | 23 +++++++++++-------- llvm/test/Transforms/InstCombine/fpextend.ll | 11 +++++++++ 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index ce0df53d9ffb9..fc5c9b201487e 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3525,6 +3525,7 @@ CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, "Invalid cast"); unsigned SrcBits = C->getType()->getScalarSizeInBits(); unsigned DstBits = Ty->getScalarSizeInBits(); + assert((C->getType() == Ty || SrcBits != DstBits) && "Invalid cast"); Instruction::CastOps opcode = (SrcBits == DstBits ? Instruction::BitCast : (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt)); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index ed47de287302e..33ed1d5575375 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1543,11 +1543,14 @@ static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { return !losesInfo; } -static Type *shrinkFPConstant(ConstantFP *CFP) { +static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext())) return nullptr; // No constant folding of this. + // See if the value can be truncated to bfloat and then reextended. + if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat())) + return Type::getBFloatTy(CFP->getContext()); // See if the value can be truncated to half and then reextended. - if (fitsInFPType(CFP, APFloat::IEEEhalf())) + if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf())) return Type::getHalfTy(CFP->getContext()); // See if the value can be truncated to float and then reextended. 
if (fitsInFPType(CFP, APFloat::IEEEsingle())) @@ -1562,7 +1565,7 @@ static Type *shrinkFPConstant(ConstantFP *CFP) { // Determine if this is a vector of ConstantFPs and if so, return the minimal // type we can safely truncate all elements to. -static Type *shrinkFPConstantVector(Value *V) { +static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) { auto *CV = dyn_cast(V); auto *CVVTy = dyn_cast(V->getType()); if (!CV || !CVVTy) @@ -1582,7 +1585,7 @@ static Type *shrinkFPConstantVector(Value *V) { if (!CFP) return nullptr; - Type *T = shrinkFPConstant(CFP); + Type *T = shrinkFPConstant(CFP, PreferBFloat); if (!T) return nullptr; @@ -1597,7 +1600,7 @@ static Type *shrinkFPConstantVector(Value *V) { } /// Find the minimum FP type we can safely truncate to. -static Type *getMinimumFPType(Value *V) { +static Type *getMinimumFPType(Value *V, bool PreferBFloat) { if (auto *FPExt = dyn_cast(V)) return FPExt->getOperand(0)->getType(); @@ -1605,7 +1608,7 @@ static Type *getMinimumFPType(Value *V) { // that can accurately represent it. This allows us to turn // (float)((double)X+2.0) into x+2.0f. if (auto *CFP = dyn_cast(V)) - if (Type *T = shrinkFPConstant(CFP)) + if (Type *T = shrinkFPConstant(CFP, PreferBFloat)) return T; // We can only correctly find a minimum type for a scalable vector when it is @@ -1617,7 +1620,7 @@ static Type *getMinimumFPType(Value *V) { // Try to shrink a vector of FP constants. 
This returns nullptr on scalable // vectors - if (Type *T = shrinkFPConstantVector(V)) + if (Type *T = shrinkFPConstantVector(V, PreferBFloat)) return T; return V->getType(); @@ -1686,8 +1689,10 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { Type *Ty = FPT.getType(); auto *BO = dyn_cast(FPT.getOperand(0)); if (BO && BO->hasOneUse()) { - Type *LHSMinType = getMinimumFPType(BO->getOperand(0)); - Type *RHSMinType = getMinimumFPType(BO->getOperand(1)); + Type *LHSMinType = + getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy()); + Type *RHSMinType = + getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy()); unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); diff --git a/llvm/test/Transforms/InstCombine/fpextend.ll b/llvm/test/Transforms/InstCombine/fpextend.ll index a41f2a4ca300f..19f512d717a97 100644 --- a/llvm/test/Transforms/InstCombine/fpextend.ll +++ b/llvm/test/Transforms/InstCombine/fpextend.ll @@ -437,3 +437,14 @@ define half @bf16_to_f32_to_f16(bfloat %a) nounwind { %z = fptrunc float %y to half ret half %z } + +define bfloat @bf16_frem(bfloat %x) { +; CHECK-LABEL: @bf16_frem( +; CHECK-NEXT: [[FREM:%.*]] = frem bfloat [[X:%.*]], 0xR40C9 +; CHECK-NEXT: ret bfloat [[FREM]] +; + %t1 = fpext bfloat %x to float + %t2 = frem float %t1, 6.281250e+00 + %t3 = fptrunc float %t2 to bfloat + ret bfloat %t3 +} From 9dbedcac1243e8e99103bdff37da51dded67b766 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 22 Feb 2024 06:28:12 -0800 Subject: [PATCH 220/351] [build] Check RUNTIMES_${target}_LLVM_ENABLE_RUNTIMES for libc also (#82561) When checking whether we need to build libc-hdrgen, we need to check LLVM_ENABLE_RUNTIMES and RUNTIMES_${target}_LLVM_ENABLE_RUNTIMES, just the former is not sufficient since libc may be enabled only for certain targets. 
--- libc/CMakeLists.txt | 17 +++++++++++++++-- llvm/CMakeLists.txt | 14 +++++++++++++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 616beae13d9aa..9f9839423499e 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -57,9 +57,21 @@ if(LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) endif() endif() +set(NEED_LIBC_HDRGEN FALSE) +if(NOT LLVM_RUNTIMES_BUILD) + if("libc" IN_LIST LLVM_ENABLE_RUNTIMES) + set(NEED_LIBC_HDRGEN TRUE) + else() + foreach(_name ${LLVM_RUNTIME_TARGETS}) + if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES) + set(NEED_LIBC_HDRGEN TRUE) + break() + endif() + endforeach() + endif() +endif() option(LIBC_HDRGEN_ONLY "Only build the 'libc-hdrgen' executable" OFF) -if(("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND NOT LLVM_RUNTIMES_BUILD) OR - LIBC_HDRGEN_ONLY) +if(LIBC_HDRGEN_ONLY OR NEED_LIBC_HDRGEN) # When libc is build as part of the runtimes/bootstrap build's CMake run, we # only need to build the host tools to build the libc. So, we just do enough # to build libc-hdrgen and return. @@ -70,6 +82,7 @@ if(("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND NOT LLVM_RUNTIMES_BUILD) OR endif() return() endif() +unset(NEED_LIBC_HDRGEN) option(LIBC_CMAKE_VERBOSE_LOGGING "Log details warnings and notifications during CMake configuration." 
OFF) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 98cef005f07e0..dbd5fbf226bd5 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -168,7 +168,18 @@ foreach(proj IN LISTS LLVM_ENABLE_RUNTIMES) endif() endforeach() -if ("libc" IN_LIST LLVM_ENABLE_RUNTIMES) +set(NEED_LIBC_HDRGEN FALSE) +if("libc" IN_LIST LLVM_ENABLE_RUNTIMES) + set(NEED_LIBC_HDRGEN TRUE) +else() + foreach(_name ${LLVM_RUNTIME_TARGETS}) + if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES) + set(NEED_LIBC_HDRGEN TRUE) + break() + endif() + endforeach() +endif() +if(NEED_LIBC_HDRGEN) # To build the libc runtime, we need to be able to build few libc build # tools from the "libc" project. So, we add it to the list of enabled # projects. @@ -177,6 +188,7 @@ if ("libc" IN_LIST LLVM_ENABLE_RUNTIMES) list(APPEND LLVM_ENABLE_PROJECTS "libc") endif() endif() +unset(NEED_LIBC_HDRGEN) # LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the # `LLVM_ENABLE_PROJECTS` CMake cache variable. This exists for From cf8fc53a96f844328be8d20435c5b4151a7b8f92 Mon Sep 17 00:00:00 2001 From: agozillon Date: Thu, 22 Feb 2024 15:33:48 +0100 Subject: [PATCH 221/351] [Flang][LLVM][OpenMP] Relax target data restrictions to be more inline with the specification (#82537) Currently we emit errors whenever a map is not provided on a target data directive, however, I believe that's incorrect behavior, the specification states: "At least one map, use_device_addr or use_device_ptr clause must appear on the directive" So provided one is present, the directive is legal in this case. Slightly different to its siblings (enter/exit/update) which don't have use_device_addr/use_device_ptr. 
--- flang/test/Semantics/OpenMP/device-constructs.f90 | 12 +++++++++++- llvm/include/llvm/Frontend/OpenMP/OMP.td | 8 +++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/flang/test/Semantics/OpenMP/device-constructs.f90 b/flang/test/Semantics/OpenMP/device-constructs.f90 index 51f00700b6daf..1ac00ef922c6b 100644 --- a/flang/test/Semantics/OpenMP/device-constructs.f90 +++ b/flang/test/Semantics/OpenMP/device-constructs.f90 @@ -2,9 +2,11 @@ ! Check OpenMP clause validity for the following directives: ! 2.10 Device constructs program main + use iso_c_binding real(8) :: arrayA(256), arrayB(256) integer :: N + type(c_ptr) :: cptr arrayA = 1.414 arrayB = 3.14 @@ -135,7 +137,15 @@ program main enddo !$omp end target data - !ERROR: At least one of MAP clause must appear on the TARGET DATA directive + !$omp target data device(0) use_device_addr(cptr) + cptr = c_null_ptr + !$omp end target data + + !$omp target data device(0) use_device_addr(cptr) + cptr = c_null_ptr + !$omp end target data + + !ERROR: At least one of MAP, USE_DEVICE_ADDR, USE_DEVICE_PTR clause must appear on the TARGET DATA directive !$omp target data device(0) do i = 1, N a = 3.14 diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 1481328bf483b..77d207f2b10a8 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -710,16 +710,14 @@ def OMP_Requires : Directive<"requires"> { } def OMP_Nothing : Directive<"nothing"> {} def OMP_TargetData : Directive<"target data"> { - let allowedClauses = [ - VersionedClause, - VersionedClause - ]; let allowedOnceClauses = [ VersionedClause, VersionedClause ]; let requiredClauses = [ - VersionedClause + VersionedClause, + VersionedClause, + VersionedClause ]; } def OMP_TargetEnterData : Directive<"target enter data"> { From 27498e9942dbb8dd005588a03d6777088d2255ce Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 22 Feb 2024 14:35:05 +0000 Subject: 
[PATCH 222/351] [Flang][OpenMP] Prevent ICE for certain constructs in unnamed programs (#73938) This patch fixes #72748 by modifying the processing of program units to search for a symbol to which OpenMP REQUIRES clauses can bind to. Rather than picking up the first PFT node with a source reference and getting its associated scope, it picks up the last one. This avoids using the source from the first specification construct of a nameless program, which can sometimes not be associated to any scope, causing an ICE due to an invalid source location. --- flang/lib/Semantics/resolve-directives.cpp | 2 +- flang/test/Semantics/OpenMP/struct.f90 | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 flang/test/Semantics/OpenMP/struct.f90 diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index a826f0181e580..215a3c9060a24 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -26,7 +26,7 @@ template static Fortran::semantics::Scope *GetScope( Fortran::semantics::SemanticsContext &context, const T &x) { - std::optional source{GetSource(x)}; + std::optional source{GetLastSource(x)}; return source ? &context.FindScope(*source) : nullptr; } diff --git a/flang/test/Semantics/OpenMP/struct.f90 b/flang/test/Semantics/OpenMP/struct.f90 new file mode 100644 index 0000000000000..8ae1fbe4da86f --- /dev/null +++ b/flang/test/Semantics/OpenMP/struct.f90 @@ -0,0 +1,7 @@ +! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp +! Check OpenMP compatibility with the DEC STRUCTURE extension + +structure /s/ +end structure + +end From 8e28037374934c60602cb8c85874f443e3348b9e Mon Sep 17 00:00:00 2001 From: Kai Nacke Date: Thu, 22 Feb 2024 09:52:44 -0500 Subject: [PATCH 223/351] [SystemZ] Add SystemZ path for the PR labeler (#82515) Similar to #82200: Add paths for SystemZ related changes to the PR labeler. 
There is no pr-subscribers-backend:SystemZ team in the llvm org yet. Much appreciated if some admin can help to create the team. --- .github/new-prs-labeler.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 7a37a96d6e381..8ed976fbdddc6 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -846,6 +846,26 @@ backend:PowerPC: - clang/lib/Driver/ToolChains/Arch/PPC.* - clang/test/CodeGen/PowerPC/** +backend:SystemZ: + - llvm/include/llvm/BinaryFormat/ELFRelocs/SystemZ* + - llvm/include/llvm/BinaryFormat/GOFF.h + - llvm/include/llvm/IR/IntrinsicsSystemZ.td + - llvm/lib/Target/SystemZ/** + - llvm/test/Analysis/**/SystemZ/** + - llvm/test/CodeGen/SystemZ/** + - llvm/test/DebugInfo/SystemZ/** + - llvm/test/ExecutionEngine/**/SystemZ/** + - llvm/test/MC/Disassembler/SystemZ/** + - llvm/test/MC/GOFF/** + - llvm/test/MC/SystemZ/** + - llvm/test/Transforms/**/SystemZ/** + - clang/include/clang/Basic/BuiltinsSystemZ.* + - clang/lib/Basic/Targets/SystemZ.* + - clang/lib/CodeGen/Targets/SystemZ.cpp + - clang/lib/Driver/ToolChains/ZOS* + - clang/lib/Driver/ToolChains/Arch/SystemZ.* + - clang/test/CodeGen/SystemZ/** + third-party:unittests: - third-party/unittests/** From 307409a8872ff27339d5d5c6a7e7777254972f34 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 22 Feb 2024 14:59:50 +0000 Subject: [PATCH 224/351] [flang] Fix warning fix This fixes 73c646a3b27293f8cb4ba120de7bc01c223b4b5f. I misread the #ifdefs and didn't realise that they were in the middle of passing parameters to a function. Move the workaround outside this. 
--- flang/lib/Evaluate/fold-integer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp index 09b2f91debda2..25ae4831ab208 100644 --- a/flang/lib/Evaluate/fold-integer.cpp +++ b/flang/lib/Evaluate/fold-integer.cpp @@ -704,6 +704,7 @@ Expr> FoldIntrinsicFunction( return common::visit( [&funcRef, &context, &FromInt64](const auto &str) -> Expr { using Char = typename std::decay_t::Result; + (void)FromInt64; return FoldElementalIntrinsic(context, std::move(funcRef), ScalarFunc( @@ -719,7 +720,6 @@ Expr> FoldIntrinsicFunction( // CharacterUtils<2>::ICHAR(). Can't find a work-around, // so remove the FromInt64 error checking lambda that // seems to have caused the proble. - (void)FromInt64; [](const Scalar &c) { return CharacterUtils::ICHAR( CharacterUtils::Resize(c, 1)); From 20434bf3731389773fb8569889bd5d06375683bf Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Thu, 22 Feb 2024 15:12:43 +0000 Subject: [PATCH 225/351] [RemoveDIs][NFC] Add DPLabel class [2/3] (#82376) Patch 2 of 3 to add llvm.dbg.label support to the RemoveDIs project. The patch stack adds the DPLabel class, which is the RemoveDIs llvm.dbg.label equivalent. 1. Add DbgRecord base class for DPValue and the not-yet-added DPLabel class. -> 2. Add the DPLabel class. 3. Enable dbg.label conversion and add support to passes. This will be used (and tested) in the final patch(es), coming next. 
--- .../include/llvm/IR/DebugProgramInstruction.h | 32 ++++++++++++-- llvm/lib/IR/AsmWriter.cpp | 43 +++++++++++++++++-- llvm/lib/IR/DebugProgramInstruction.cpp | 23 +++++++--- 3 files changed, 85 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 1fa6b6f519640..1c8619741eb69 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -79,14 +79,13 @@ class raw_ostream; /// deleteRecord /// clone /// isIdenticalToWhenDefined -/// isEquivalentTo /// both print methods class DbgRecord : public ilist_node { public: /// Marker that this DbgRecord is linked into. DPMarker *Marker = nullptr; /// Subclass discriminator. - enum Kind : uint8_t { ValueKind }; + enum Kind : uint8_t { ValueKind, LabelKind }; protected: DebugLoc DbgLoc; @@ -104,9 +103,11 @@ class DbgRecord : public ilist_node { void print(raw_ostream &O, bool IsForDebug = false) const; void print(raw_ostream &O, ModuleSlotTracker &MST, bool IsForDebug) const; bool isIdenticalToWhenDefined(const DbgRecord &R) const; - bool isEquivalentTo(const DbgRecord &R) const; ///@} + /// Same as isIdenticalToWhenDefined but checks DebugLoc too. + bool isEquivalentTo(const DbgRecord &R) const; + Kind getRecordKind() const { return RecordKind; } void setMarker(DPMarker *M) { Marker = M; } @@ -156,6 +157,31 @@ class DbgRecord : public ilist_node { ~DbgRecord() = default; }; +/// Records a position in IR for a source label (DILabel). Corresponds to the +/// llvm.dbg.label intrinsic. +/// FIXME: Rename DbgLabelRecord when DPValue is renamed to DbgVariableRecord. 
+class DPLabel : public DbgRecord { + DILabel *Label; + +public: + DPLabel(DILabel *Label, DebugLoc DL) + : DbgRecord(LabelKind, DL), Label(Label) { + assert(Label && "Unexpected nullptr"); + } + + DPLabel *clone() const; + void print(raw_ostream &O, bool IsForDebug = false) const; + void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const; + + void setLabel(DILabel *NewLabel) { Label = NewLabel; } + DILabel *getLabel() const { return Label; } + + /// Support type inquiry through isa, cast, and dyn_cast. + static bool classof(const DbgRecord *E) { + return E->getRecordKind() == LabelKind; + } +}; + /// Record of a variable value-assignment, aka a non instruction representation /// of the dbg.value intrinsic. /// diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ac0f119b00bde..c2a470c5fc716 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -292,8 +292,8 @@ static const Module *getModuleFromDPI(const DPMarker *Marker) { return M ? M->getParent() : nullptr; } -static const Module *getModuleFromDPI(const DPValue *DPV) { - return DPV->getMarker() ? getModuleFromDPI(DPV->getMarker()) : nullptr; +static const Module *getModuleFromDPI(const DbgRecord *DR) { + return DR->getMarker() ? 
getModuleFromDPI(DR->getMarker()) : nullptr; } static void PrintCallingConv(unsigned cc, raw_ostream &Out) { @@ -2699,6 +2699,7 @@ class AssemblyWriter { void printInstruction(const Instruction &I); void printDPMarker(const DPMarker &DPI); void printDPValue(const DPValue &DPI); + void printDPLabel(const DPLabel &DPL); void printDbgRecord(const DbgRecord &DPI); void printUseListOrder(const Value *V, const std::vector &Shuffle); @@ -4602,8 +4603,10 @@ void AssemblyWriter::printDPMarker(const DPMarker &Marker) { void AssemblyWriter::printDbgRecord(const DbgRecord &DR) { if (auto *DPV = dyn_cast(&DR)) printDPValue(*DPV); + else if (auto *DPL = dyn_cast(&DR)) + printDPLabel(*DPL); else - llvm_unreachable("unsupported dbg record"); + llvm_unreachable("Unexpected DbgRecord kind"); } void AssemblyWriter::printDPValue(const DPValue &Value) { @@ -4645,6 +4648,16 @@ void AssemblyWriter::printDPValue(const DPValue &Value) { Out << " }"; } +void AssemblyWriter::printDPLabel(const DPLabel &Label) { + // There's no formal representation of a DPLabel -- print purely as + // a debugging aid. 
+ Out << " DPLabel { "; + auto WriterCtx = getContext(); + WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true); + Out << " marker @" << Label.getMarker(); + Out << " }"; +} + void AssemblyWriter::printMetadataAttachments( const SmallVectorImpl> &MDs, StringRef Separator) { @@ -4908,6 +4921,12 @@ void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, W.printDPMarker(*this); } +void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const { + + ModuleSlotTracker MST(getModuleFromDPI(this), true); + print(ROS, MST, IsForDebug); +} + void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { // There's no formal representation of a DPValue -- print purely as a @@ -4927,6 +4946,24 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, W.printDPValue(*this); } +void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST, + bool IsForDebug) const { + // There's no formal representation of a DbgLabelRecord -- print purely as + // a debugging aid. + formatted_raw_ostream OS(ROS); + SlotTracker EmptySlotTable(static_cast(nullptr)); + SlotTracker &SlotTable = + MST.getMachine() ? *MST.getMachine() : EmptySlotTable; + auto incorporateFunction = [&](const Function *F) { + if (F) + MST.incorporateFunction(*F); + }; + incorporateFunction(Marker->getParent() ? 
Marker->getParent()->getParent() + : nullptr); + AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug); + W.printDPLabel(*this); +} + void Value::print(raw_ostream &ROS, bool IsForDebug) const { bool ShouldInitializeAllMetadata = false; if (auto *I = dyn_cast(this)) diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index eb18be5d817a9..2ca4533afa96c 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -64,6 +64,9 @@ void DbgRecord::deleteRecord() { case ValueKind: delete cast(this); return; + case LabelKind: + delete cast(this); + return; } llvm_unreachable("unsupported DbgRecord kind"); } @@ -73,6 +76,9 @@ void DbgRecord::print(raw_ostream &O, bool IsForDebug) const { case ValueKind: cast(this)->print(O, IsForDebug); return; + case LabelKind: + cast(this)->print(O, IsForDebug); + return; }; llvm_unreachable("unsupported DbgRecord kind"); } @@ -83,6 +89,9 @@ void DbgRecord::print(raw_ostream &O, ModuleSlotTracker &MST, case ValueKind: cast(this)->print(O, MST, IsForDebug); return; + case LabelKind: + cast(this)->print(O, MST, IsForDebug); + return; }; llvm_unreachable("unsupported DbgRecord kind"); } @@ -93,18 +102,14 @@ bool DbgRecord::isIdenticalToWhenDefined(const DbgRecord &R) const { switch (RecordKind) { case ValueKind: return cast(this)->isIdenticalToWhenDefined(*cast(&R)); + case LabelKind: + return cast(this)->getLabel() == cast(R).getLabel(); }; llvm_unreachable("unsupported DbgRecord kind"); } bool DbgRecord::isEquivalentTo(const DbgRecord &R) const { - if (RecordKind != R.RecordKind) - return false; - switch (RecordKind) { - case ValueKind: - return cast(this)->isEquivalentTo(*cast(&R)); - }; - llvm_unreachable("unsupported DbgRecord kind"); + return getDebugLoc() == R.getDebugLoc() && isIdenticalToWhenDefined(R); } DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV, @@ -307,12 +312,16 @@ DbgRecord *DbgRecord::clone() 
const { switch (RecordKind) { case ValueKind: return cast(this)->clone(); + case LabelKind: + return cast(this)->clone(); }; llvm_unreachable("unsupported DbgRecord kind"); } DPValue *DPValue::clone() const { return new DPValue(*this); } +DPLabel *DPLabel::clone() const { return new DPLabel(Label, getDebugLoc()); } + DbgVariableIntrinsic * DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { [[maybe_unused]] DICompileUnit *Unit = From 601c9bec736739da9160092ef60e3468266816bd Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Thu, 22 Feb 2024 15:25:36 +0000 Subject: [PATCH 226/351] [clang][NFC] Fix arm_acle.h title headers (#82624) Fix some title headers to align them with the actual ACLE document. --- clang/lib/Headers/arm_acle.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h index 9cd34948e3c53..6e557eda1dddc 100644 --- a/clang/lib/Headers/arm_acle.h +++ b/clang/lib/Headers/arm_acle.h @@ -313,7 +313,7 @@ __qdbl(int32_t __t) { } #endif -/* 8.4.3 Accumultating multiplications */ +/* 8.4.3 Accumulating multiplications */ #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) __smlabb(int32_t __a, int32_t __b, int32_t __c) { @@ -545,7 +545,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) { } #endif -/* 8.5.10 Parallel 16-bit multiplications */ +/* 8.5.10 Parallel 16-bit multiplication */ #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) __smlad(int16x2_t __a, int16x2_t __b, int32_t __c) { @@ -748,7 +748,7 @@ __arm_st64bv0(void *__addr, data512_t __value) { #define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v)) #define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v)) -/* 10.3 Memory Tagging Extensions (MTE) Intrinsics */ +/* 10.3 MTE intrinsics */ #if 
defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE #define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask) #define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset) @@ -757,7 +757,7 @@ __arm_st64bv0(void *__addr, data512_t __value) { #define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr) #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb) -/* 18 Memory Operations Intrinsics */ +/* 18 memcpy family of operations intrinsics - MOPS */ #define __arm_mops_memset_tag(__tagged_address, __value, __size) \ __builtin_arm_mops_memset_tag(__tagged_address, __value, __size) #endif From 08eced5fccd2f103379292f119834a7a3c3b6b25 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 22 Feb 2024 15:29:04 +0000 Subject: [PATCH 227/351] [mlir][test] Add -march=aarch64 -mattr=+sve to test-scalable-interleave Fix for https://lab.llvm.org/buildbot/#/builders/179/builds/9438 --- .../Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir index 8ae3eee6462ca..07989bd71f501 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir @@ -1,6 +1,7 @@ // RUN: mlir-opt %s -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd -e entry -entry-point-result=void \ -// RUN: -shared-libs=%mlir_c_runner_utils,%mlir_arm_runner_utils | \ +// RUN: -shared-libs=%mlir_c_runner_utils,%mlir_arm_runner_utils \ +// RUN: -march=aarch64 -mattr=+sve | \ // RUN: FileCheck %s func.func @entry() { From 695a9d84dc1dd003c31d3e5e22af3525c31218c2 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 22 Feb 2024 16:00:33 +0000 Subject: [PATCH 228/351] LoopVectorize: add test for crash in 
#72969 (#74111) --- .../Transforms/LoopVectorize/X86/pr72969.ll | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr72969.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll new file mode 100644 index 0000000000000..a54bd39f3ff60 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -0,0 +1,25 @@ +; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s +; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s + +@h = global i64 0 + +define void @test(ptr %p) { +entry: + br label %for.body + +for.body: + %idx.ext.merge = phi i64 [ 1, %entry ], [ %idx, %for.body ] + %inc.merge = phi i16 [ 1, %entry ], [ %inc, %for.body ] + %idx.merge = phi i64 [ 0, %entry ], [ %idx.ext.merge, %for.body ] + %add = shl i64 %idx.merge, 1 + %arrayidx = getelementptr i64, ptr %p, i64 %add + store i64 0, ptr %arrayidx + %inc = add i16 %inc.merge, 1 + %idx = zext i16 %inc to i64 + %gep = getelementptr i64, ptr %p, i64 %idx + %cmp = icmp ugt ptr %gep, @h + br i1 %cmp, label %exit, label %for.body + +exit: + ret void +} From 9eb5f94f9b47154cf07160a6ba74ab1c31becfa3 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Feb 2024 07:54:51 -0800 Subject: [PATCH 229/351] [RISCV][AArch64] Add vscale_range attribute to tests per architecture minimums Spent a bunch of time tracing down an odd issue "in SCEV" which turned out to be the fact that SCEV doesn't have access to TTI. As a result, the only way for it to get range facts on vscales (to avoid collapsing ranges of element counts and type sizes to trivial ranges on multiplies) is to look at the vscale_range attribute. Since vscale_range is set by clang by default, manually setting it in the tests shouldn't interfere with the test intent. 
--- .../AArch64/clamped-trip-count.ll | 90 +++++++++---------- .../LoopVectorize/RISCV/low-trip-count.ll | 2 +- 2 files changed, 41 insertions(+), 51 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 44ace377ac792..3e895edcd4f4f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt -S < %s -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve 2>&1 | FileCheck %s -define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ +define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,16) { ; CHECK-LABEL: define void @clamped_tc_8( ; CHECK-SAME: ptr nocapture [[DST:%.*]], i32 [[N:%.*]], i64 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -18,20 +18,15 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 8, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 8, [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) -; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, 
[[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP16]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv8i64() +; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP11]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -40,17 +35,17 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP20:%.*]] = lshr [[BROADCAST_SPLAT]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = trunc [[TMP20]] to -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], 
i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP21]], ptr [[TMP22]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = lshr [[BROADCAST_SPLAT]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc [[TMP15]] to +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP16]], ptr [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -61,8 +56,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP24]] 
+; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]] ; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[SHR3]] to i8 ; CHECK-NEXT: store i8 [[CONV4]], ptr [[P_OUT_TAIL_09]], align 1 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1 @@ -91,7 +86,7 @@ for.cond.cleanup: ; preds = %for.body ret void } -define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ +define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,16) { ; CHECK-LABEL: define void @clamped_tc_max_8( ; CHECK-SAME: ptr nocapture [[DST:%.*]], i32 [[N:%.*]], i64 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -115,20 +110,15 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) -; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP16]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP17]], 
i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv8i64() +; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP11]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -137,17 +127,17 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP20:%.*]] = lshr [[BROADCAST_SPLAT]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = trunc [[TMP20]] to -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP21]], ptr [[TMP22]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 
[[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = lshr [[BROADCAST_SPLAT]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc [[TMP15]] to +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP16]], ptr [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -158,8 +148,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP24]] +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]] ; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[SHR3]] to i8 ; CHECK-NEXT: store i8 
[[CONV4]], ptr [[P_OUT_TAIL_09]], align 1 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 0c5394cb95a61..acb4489bd76b0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -74,4 +74,4 @@ for.end: ; preds = %for.body ret void } -attributes #0 = { "target-features"="+v,+d" } +attributes #0 = { "target-features"="+v,+d" vscale_range(2, 1024) } From 0107c8824b695db86706bbc3466bbfd585a754aa Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 00:18:56 +0800 Subject: [PATCH 230/351] [RISCV][SDAG] Improve codegen of select with constants if zicond is available (#82456) This patch uses `add + czero.eqz/nez` to lower select with constants if zicond is available. ``` (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1) (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2) ``` The above code sequence is suggested by [RISCV Optimization Guide](https://riscv-optimization-guide-riseproject-c94355ae3e6872252baa952524.gitlab.io/riscv-optimization-guide.html#_avoid_branches_using_conditional_moves). 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 20 ++ llvm/test/CodeGen/RISCV/select.ll | 252 +++++++++++++++++++- 2 files changed, 262 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index cf0dc36a51b61..6bf02cf8c0f87 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -7379,6 +7379,26 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget)) return V; + // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1) + // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2) + if (isa(TrueV) && isa(FalseV)) { + const APInt &TrueVal = TrueV->getAsAPIntVal(); + const APInt &FalseVal = FalseV->getAsAPIntVal(); + const int TrueValCost = RISCVMatInt::getIntMatCost( + TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + const int FalseValCost = RISCVMatInt::getIntMatCost( + FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + bool IsCZERO_NEZ = TrueValCost <= FalseValCost; + SDValue LHSVal = DAG.getConstant( + IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT); + SDValue RHSVal = + DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT); + SDValue CMOV = + DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ, + DL, VT, LHSVal, CondV); + return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal); + } + // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c)) // Unless we have the short forward branch optimization. 
if (!Subtarget.hasConditionalMoveFusion()) diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index e01984b7c5843..e07e52091e9e7 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -1606,23 +1606,255 @@ define i32 @select_cst_unknown(i32 signext %a, i32 signext %b) { ; RV64IMXVTCONDOPS-LABEL: select_cst_unknown: ; RV64IMXVTCONDOPS: # %bb.0: ; RV64IMXVTCONDOPS-NEXT: slt a0, a0, a1 -; RV64IMXVTCONDOPS-NEXT: li a1, -7 -; RV64IMXVTCONDOPS-NEXT: vt.maskcn a1, a1, a0 -; RV64IMXVTCONDOPS-NEXT: li a2, 5 -; RV64IMXVTCONDOPS-NEXT: vt.maskc a0, a2, a0 -; RV64IMXVTCONDOPS-NEXT: or a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: li a1, -12 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 5 ; RV64IMXVTCONDOPS-NEXT: ret ; ; CHECKZICOND-LABEL: select_cst_unknown: ; CHECKZICOND: # %bb.0: ; CHECKZICOND-NEXT: slt a0, a0, a1 -; CHECKZICOND-NEXT: li a1, -7 -; CHECKZICOND-NEXT: czero.nez a1, a1, a0 -; CHECKZICOND-NEXT: li a2, 5 -; CHECKZICOND-NEXT: czero.eqz a0, a2, a0 -; CHECKZICOND-NEXT: or a0, a0, a1 +; CHECKZICOND-NEXT: li a1, -12 +; CHECKZICOND-NEXT: czero.nez a0, a1, a0 +; CHECKZICOND-NEXT: addi a0, a0, 5 ; CHECKZICOND-NEXT: ret %cond = icmp slt i32 %a, %b %ret = select i1 %cond, i32 5, i32 -7 ret i32 %ret } + +define i32 @select_cst1(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst1: +; RV32IM: # %bb.0: +; RV32IM-NEXT: mv a1, a0 +; RV32IM-NEXT: li a0, 10 +; RV32IM-NEXT: bnez a1, .LBB43_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: li a0, 20 +; RV32IM-NEXT: .LBB43_2: +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst1: +; RV64IM: # %bb.0: +; RV64IM-NEXT: mv a1, a0 +; RV64IM-NEXT: li a0, 10 +; RV64IM-NEXT: bnez a1, .LBB43_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: li a0, 20 +; RV64IM-NEXT: .LBB43_2: +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst1: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: li a1, 10 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; 
RV64IMXVTCONDOPS-NEXT: addi a0, a0, 10 +; RV64IMXVTCONDOPS-NEXT: ret +; +; CHECKZICOND-LABEL: select_cst1: +; CHECKZICOND: # %bb.0: +; CHECKZICOND-NEXT: li a1, 10 +; CHECKZICOND-NEXT: czero.nez a0, a1, a0 +; CHECKZICOND-NEXT: addi a0, a0, 10 +; CHECKZICOND-NEXT: ret + %ret = select i1 %cond, i32 10, i32 20 + ret i32 %ret +} + +define i32 @select_cst2(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst2: +; RV32IM: # %bb.0: +; RV32IM-NEXT: mv a1, a0 +; RV32IM-NEXT: li a0, 10 +; RV32IM-NEXT: bnez a1, .LBB44_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: lui a0, 5 +; RV32IM-NEXT: addi a0, a0, -480 +; RV32IM-NEXT: .LBB44_2: +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst2: +; RV64IM: # %bb.0: +; RV64IM-NEXT: mv a1, a0 +; RV64IM-NEXT: li a0, 10 +; RV64IM-NEXT: bnez a1, .LBB44_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: lui a0, 5 +; RV64IM-NEXT: addiw a0, a0, -480 +; RV64IM-NEXT: .LBB44_2: +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst2: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: lui a1, 5 +; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, -490 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 10 +; RV64IMXVTCONDOPS-NEXT: ret +; +; RV32IMZICOND-LABEL: select_cst2: +; RV32IMZICOND: # %bb.0: +; RV32IMZICOND-NEXT: lui a1, 5 +; RV32IMZICOND-NEXT: addi a1, a1, -490 +; RV32IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV32IMZICOND-NEXT: addi a0, a0, 10 +; RV32IMZICOND-NEXT: ret +; +; RV64IMZICOND-LABEL: select_cst2: +; RV64IMZICOND: # %bb.0: +; RV64IMZICOND-NEXT: lui a1, 5 +; RV64IMZICOND-NEXT: addiw a1, a1, -490 +; RV64IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV64IMZICOND-NEXT: addi a0, a0, 10 +; RV64IMZICOND-NEXT: ret + %ret = select i1 %cond, i32 10, i32 20000 + ret i32 %ret +} + +define i32 @select_cst3(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst3: +; RV32IM: # %bb.0: +; RV32IM-NEXT: bnez a0, .LBB45_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: lui a0, 5 +; RV32IM-NEXT: addi a0, a0, -480 +; RV32IM-NEXT: ret +; RV32IM-NEXT: .LBB45_2: +; 
RV32IM-NEXT: lui a0, 7 +; RV32IM-NEXT: addi a0, a0, 1328 +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst3: +; RV64IM: # %bb.0: +; RV64IM-NEXT: bnez a0, .LBB45_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: lui a0, 5 +; RV64IM-NEXT: addiw a0, a0, -480 +; RV64IM-NEXT: ret +; RV64IM-NEXT: .LBB45_2: +; RV64IM-NEXT: lui a0, 7 +; RV64IM-NEXT: addiw a0, a0, 1328 +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst3: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: lui a1, 1048574 +; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, -1808 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: lui a1, 7 +; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, 1328 +; RV64IMXVTCONDOPS-NEXT: add a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: ret +; +; RV32IMZICOND-LABEL: select_cst3: +; RV32IMZICOND: # %bb.0: +; RV32IMZICOND-NEXT: lui a1, 1048574 +; RV32IMZICOND-NEXT: addi a1, a1, -1808 +; RV32IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV32IMZICOND-NEXT: lui a1, 7 +; RV32IMZICOND-NEXT: addi a1, a1, 1328 +; RV32IMZICOND-NEXT: add a0, a0, a1 +; RV32IMZICOND-NEXT: ret +; +; RV64IMZICOND-LABEL: select_cst3: +; RV64IMZICOND: # %bb.0: +; RV64IMZICOND-NEXT: lui a1, 1048574 +; RV64IMZICOND-NEXT: addiw a1, a1, -1808 +; RV64IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV64IMZICOND-NEXT: lui a1, 7 +; RV64IMZICOND-NEXT: addiw a1, a1, 1328 +; RV64IMZICOND-NEXT: add a0, a0, a1 +; RV64IMZICOND-NEXT: ret + %ret = select i1 %cond, i32 30000, i32 20000 + ret i32 %ret +} + +define i32 @select_cst4(i1 zeroext %cond) { +; CHECK-LABEL: select_cst4: +; CHECK: # %bb.0: +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: xori a0, a0, 2047 +; CHECK-NEXT: ret + %ret = select i1 %cond, i32 -2048, i32 2047 + ret i32 %ret +} + +define i32 @select_cst5(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst5: +; RV32IM: # %bb.0: +; RV32IM-NEXT: mv a1, a0 +; RV32IM-NEXT: li a0, 2047 +; RV32IM-NEXT: bnez a1, .LBB47_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a0, a0, -2047 +; RV32IM-NEXT: .LBB47_2: +; RV32IM-NEXT: 
ret +; +; RV64IM-LABEL: select_cst5: +; RV64IM: # %bb.0: +; RV64IM-NEXT: mv a1, a0 +; RV64IM-NEXT: li a0, 2047 +; RV64IM-NEXT: bnez a1, .LBB47_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: lui a0, 1 +; RV64IM-NEXT: addiw a0, a0, -2047 +; RV64IM-NEXT: .LBB47_2: +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst5: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: li a1, 2 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 2047 +; RV64IMXVTCONDOPS-NEXT: ret +; +; CHECKZICOND-LABEL: select_cst5: +; CHECKZICOND: # %bb.0: +; CHECKZICOND-NEXT: li a1, 2 +; CHECKZICOND-NEXT: czero.nez a0, a1, a0 +; CHECKZICOND-NEXT: addi a0, a0, 2047 +; CHECKZICOND-NEXT: ret + %ret = select i1 %cond, i32 2047, i32 2049 + ret i32 %ret +} + +define i32 @select_cst6(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst6: +; RV32IM: # %bb.0: +; RV32IM-NEXT: bnez a0, .LBB48_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: li a0, 2047 +; RV32IM-NEXT: ret +; RV32IM-NEXT: .LBB48_2: +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a0, a0, -2047 +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst6: +; RV64IM: # %bb.0: +; RV64IM-NEXT: bnez a0, .LBB48_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: li a0, 2047 +; RV64IM-NEXT: ret +; RV64IM-NEXT: .LBB48_2: +; RV64IM-NEXT: lui a0, 1 +; RV64IM-NEXT: addiw a0, a0, -2047 +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst6: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: li a1, 2 +; RV64IMXVTCONDOPS-NEXT: vt.maskc a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 2047 +; RV64IMXVTCONDOPS-NEXT: ret +; +; CHECKZICOND-LABEL: select_cst6: +; CHECKZICOND: # %bb.0: +; CHECKZICOND-NEXT: li a1, 2 +; CHECKZICOND-NEXT: czero.eqz a0, a1, a0 +; CHECKZICOND-NEXT: addi a0, a0, 2047 +; CHECKZICOND-NEXT: ret + %ret = select i1 %cond, i32 2049, i32 2047 + ret i32 %ret +} From 43f1fa99ca7d05be9545a102e15ad0d607887839 Mon Sep 17 00:00:00 2001 From: cmtice Date: Thu, 22 Feb 2024 08:20:54 -0800 Subject: [PATCH 231/351] 
[LLVM][DebugInfo] Refactor some code for easier sharing. (#82153) Refactor the code that calculates the offsets for the various pieces of the DWARF .debug_names index section, to make it easier to share the code with other tools, such as LLD. --- .../DebugInfo/DWARF/DWARFAcceleratorTable.h | 26 +++++-- .../DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 73 ++++++++++++------- 2 files changed, 65 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h index a26c44bf7e9c2..d368c7e0ece8f 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h @@ -562,6 +562,17 @@ class DWARFDebugNames : public DWARFAcceleratorTable { uint64_t getEntryOffset() const { return EntryOffset; } }; + /// Offsets for the start of various important tables from the start of the + /// section. + struct DWARFDebugNamesOffsets { + uint64_t CUsBase; + uint64_t BucketsBase; + uint64_t HashesBase; + uint64_t StringOffsetsBase; + uint64_t EntryOffsetsBase; + uint64_t EntriesBase; + }; + /// Represents a single accelerator table within the DWARF v5 .debug_names /// section. class NameIndex { @@ -572,12 +583,7 @@ class DWARFDebugNames : public DWARFAcceleratorTable { // Base of the whole unit and of various important tables, as offsets from // the start of the section. uint64_t Base; - uint64_t CUsBase; - uint64_t BucketsBase; - uint64_t HashesBase; - uint64_t StringOffsetsBase; - uint64_t EntryOffsetsBase; - uint64_t EntriesBase; + DWARFDebugNamesOffsets Offsets; void dumpCUs(ScopedPrinter &W) const; void dumpLocalTUs(ScopedPrinter &W) const; @@ -638,7 +644,7 @@ class DWARFDebugNames : public DWARFAcceleratorTable { /// Returns the Entry at the relative `Offset` from the start of the Entry /// pool. 
Expected getEntryAtRelativeOffset(uint64_t Offset) const { - auto OffsetFromSection = Offset + this->EntriesBase; + auto OffsetFromSection = Offset + this->Offsets.EntriesBase; return getEntry(&OffsetFromSection); } @@ -793,6 +799,12 @@ class DWARFDebugNames : public DWARFAcceleratorTable { const NameIndex *getCUNameIndex(uint64_t CUOffset); }; +/// Calculates the starting offsets for various sections within the +/// .debug_names section. +void findDebugNamesOffsets(DWARFDebugNames::DWARFDebugNamesOffsets &Offsets, + uint64_t HdrSize, const dwarf::DwarfFormat Format, + const DWARFDebugNames::Header &Hdr); + /// If `Name` is the name of a templated function that includes template /// parameters, returns a substring of `Name` containing no template /// parameters. diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 78f819dd052aa..9c65d85985f1b 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -510,7 +510,7 @@ DWARFDebugNames::Abbrev DWARFDebugNames::AbbrevMapInfo::getTombstoneKey() { Expected DWARFDebugNames::NameIndex::extractAttributeEncoding(uint64_t *Offset) { - if (*Offset >= EntriesBase) { + if (*Offset >= Offsets.EntriesBase) { return createStringError(errc::illegal_byte_sequence, "Incorrectly terminated abbreviation table."); } @@ -536,7 +536,7 @@ DWARFDebugNames::NameIndex::extractAttributeEncodings(uint64_t *Offset) { Expected DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) { - if (*Offset >= EntriesBase) { + if (*Offset >= Offsets.EntriesBase) { return createStringError(errc::illegal_byte_sequence, "Incorrectly terminated abbreviation table."); } @@ -552,32 +552,50 @@ DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) { return Abbrev(Code, dwarf::Tag(Tag), AbbrevOffset, std::move(*AttrEncOr)); } +void llvm::findDebugNamesOffsets( + DWARFDebugNames::DWARFDebugNamesOffsets &Offsets, uint64_t 
HdrSize, + dwarf::DwarfFormat Format, const DWARFDebugNames::Header &Hdr) { + uint32_t DwarfSize = (Format == llvm::dwarf::DwarfFormat::DWARF64) ? 8 : 4; + uint64_t Offset = HdrSize; + Offsets.CUsBase = Offset; + Offset += Hdr.CompUnitCount * DwarfSize; + Offset += Hdr.LocalTypeUnitCount * DwarfSize; + Offset += Hdr.ForeignTypeUnitCount * 8; + + Offsets.BucketsBase = Offset; + Offset += Hdr.BucketCount * 4; + + Offsets.HashesBase = Offset; + if (Hdr.BucketCount > 0) + Offset += Hdr.NameCount * 4; + + Offsets.StringOffsetsBase = Offset; + Offset += Hdr.NameCount * DwarfSize; + + Offsets.EntryOffsetsBase = Offset; + Offset += Hdr.NameCount * DwarfSize; + + Offset += Hdr.AbbrevTableSize; + Offsets.EntriesBase = Offset; +} + Error DWARFDebugNames::NameIndex::extract() { const DWARFDataExtractor &AS = Section.AccelSection; - uint64_t Offset = Base; - if (Error E = Hdr.extract(AS, &Offset)) + uint64_t hdrSize = Base; + if (Error E = Hdr.extract(AS, &hdrSize)) return E; const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - CUsBase = Offset; - Offset += Hdr.CompUnitCount * SectionOffsetSize; - Offset += Hdr.LocalTypeUnitCount * SectionOffsetSize; - Offset += Hdr.ForeignTypeUnitCount * 8; - BucketsBase = Offset; - Offset += Hdr.BucketCount * 4; - HashesBase = Offset; - if (Hdr.BucketCount > 0) - Offset += Hdr.NameCount * 4; - StringOffsetsBase = Offset; - Offset += Hdr.NameCount * SectionOffsetSize; - EntryOffsetsBase = Offset; - Offset += Hdr.NameCount * SectionOffsetSize; + findDebugNamesOffsets(Offsets, hdrSize, Hdr.Format, Hdr); + + uint64_t Offset = + Offsets.EntryOffsetsBase + (Hdr.NameCount * SectionOffsetSize); if (!AS.isValidOffsetForDataOfSize(Offset, Hdr.AbbrevTableSize)) return createStringError(errc::illegal_byte_sequence, "Section too small: cannot read abbreviations."); - EntriesBase = Offset + Hdr.AbbrevTableSize; + Offsets.EntriesBase = Offset + Hdr.AbbrevTableSize; for (;;) { auto AbbrevOr = extractAbbrev(&Offset); @@ -679,7 
+697,7 @@ void DWARFDebugNames::Entry::dumpParentIdx( return; } - auto AbsoluteOffset = NameIdx->EntriesBase + FormValue.getRawUValue(); + auto AbsoluteOffset = NameIdx->Offsets.EntriesBase + FormValue.getRawUValue(); W.getOStream() << "Entry @ 0x" + Twine::utohexstr(AbsoluteOffset); } @@ -708,14 +726,15 @@ std::error_code DWARFDebugNames::SentinelError::convertToErrorCode() const { uint64_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const { assert(CU < Hdr.CompUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - uint64_t Offset = CUsBase + SectionOffsetSize * CU; + uint64_t Offset = Offsets.CUsBase + SectionOffsetSize * CU; return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset); } uint64_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const { assert(TU < Hdr.LocalTypeUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - uint64_t Offset = CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU); + uint64_t Offset = + Offsets.CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU); return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset); } @@ -723,7 +742,7 @@ uint64_t DWARFDebugNames::NameIndex::getForeignTUSignature(uint32_t TU) const { assert(TU < Hdr.ForeignTypeUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); uint64_t Offset = - CUsBase + + Offsets.CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) + 8 * TU; return Section.AccelSection.getU64(&Offset); } @@ -759,28 +778,28 @@ DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const { assert(0 < Index && Index <= Hdr.NameCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); uint64_t StringOffsetOffset = - StringOffsetsBase + SectionOffsetSize * (Index - 1); + Offsets.StringOffsetsBase + SectionOffsetSize * (Index - 1); uint64_t EntryOffsetOffset = - EntryOffsetsBase + 
SectionOffsetSize * (Index - 1); + Offsets.EntryOffsetsBase + SectionOffsetSize * (Index - 1); const DWARFDataExtractor &AS = Section.AccelSection; uint64_t StringOffset = AS.getRelocatedValue(SectionOffsetSize, &StringOffsetOffset); uint64_t EntryOffset = AS.getUnsigned(&EntryOffsetOffset, SectionOffsetSize); - EntryOffset += EntriesBase; + EntryOffset += Offsets.EntriesBase; return {Section.StringSection, Index, StringOffset, EntryOffset}; } uint32_t DWARFDebugNames::NameIndex::getBucketArrayEntry(uint32_t Bucket) const { assert(Bucket < Hdr.BucketCount); - uint64_t BucketOffset = BucketsBase + 4 * Bucket; + uint64_t BucketOffset = Offsets.BucketsBase + 4 * Bucket; return Section.AccelSection.getU32(&BucketOffset); } uint32_t DWARFDebugNames::NameIndex::getHashArrayEntry(uint32_t Index) const { assert(0 < Index && Index <= Hdr.NameCount); - uint64_t HashOffset = HashesBase + 4 * (Index - 1); + uint64_t HashOffset = Offsets.HashesBase + 4 * (Index - 1); return Section.AccelSection.getU32(&HashOffset); } From f67ef1a8d9841718ce08a69d935ac8fd8e6112f9 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Feb 2024 08:24:38 -0800 Subject: [PATCH 232/351] [RISCV][LV] Add additional small trip count loop coverage --- .../LoopVectorize/RISCV/low-trip-count.ll | 368 +++++++++++++++++- 1 file changed, 366 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index acb4489bd76b0..7ccbc98d26567 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -3,6 +3,116 @@ target triple = "riscv64" +define void @trip1_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip1_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], 
[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 1 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 1 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip3_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub 
i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 3) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 
[[I_08]] +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 3 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { ; CHECK-LABEL: @trip5_i8( ; CHECK-NEXT: entry: @@ -33,7 +143,7 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 
true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -50,7 +160,7 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -74,4 +184,258 @@ for.end: ; preds = %for.body ret void } +define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip8_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP7]], i64 8) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: 
ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 8 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip16_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label 
[[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 16 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + + +define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip32_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP11:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 32 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip24_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP8]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP9]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 24 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + attributes #0 = { "target-features"="+v,+d" vscale_range(2, 1024) } + From c9afd1ad783a67210bed4fd2f7108477fc986e15 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 21 Feb 2024 23:48:19 -0800 Subject: [PATCH 233/351] [RISCV] Add test case showing missed 
opportunity to form sextload when sext and zext nneg are both present. NFC --- llvm/test/CodeGen/RISCV/sext-zext-trunc.ll | 84 ++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll index a2a953ca882ba..09516d91771ca 100644 --- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll +++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll @@ -871,3 +871,87 @@ define void @zext_nneg_dominating_icmp_i32_zeroext(i16 signext %0) { 5: ret void } + +; The load is used extended and non-extended in the successor basic block. The +; signed compare will cause the non-extended value to exported out of the first +; basic block using a sext to XLen. We need to CSE the zext nneg with the sext +; so that we can form a sextload. +define void @load_zext_nneg_sext_cse(ptr %p) nounwind { +; RV32I-LABEL: load_zext_nneg_sext_cse: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lhu s0, 0(a0) +; RV32I-NEXT: slli a0, s0, 16 +; RV32I-NEXT: bltz a0, .LBB50_2 +; RV32I-NEXT: # %bb.1: # %bb1 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: call bar_i16 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: tail bar_i32 +; RV32I-NEXT: .LBB50_2: # %bb2 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: load_zext_nneg_sext_cse: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: lhu s0, 0(a0) +; RV64I-NEXT: slli a0, s0, 48 +; RV64I-NEXT: bltz a0, .LBB50_2 +; RV64I-NEXT: # %bb.1: # %bb1 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: call bar_i16 +; 
RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: tail bar_i32 +; RV64I-NEXT: .LBB50_2: # %bb2 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: load_zext_nneg_sext_cse: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: addi sp, sp, -16 +; RV64ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64ZBB-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64ZBB-NEXT: lhu s0, 0(a0) +; RV64ZBB-NEXT: sext.h a0, s0 +; RV64ZBB-NEXT: bltz a0, .LBB50_2 +; RV64ZBB-NEXT: # %bb.1: # %bb1 +; RV64ZBB-NEXT: call bar_i16 +; RV64ZBB-NEXT: mv a0, s0 +; RV64ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: addi sp, sp, 16 +; RV64ZBB-NEXT: tail bar_i32 +; RV64ZBB-NEXT: .LBB50_2: # %bb2 +; RV64ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: addi sp, sp, 16 +; RV64ZBB-NEXT: ret + %load = load i16, ptr %p + %zext = zext nneg i16 %load to i32 + %cmp = icmp sgt i16 %load, -1 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + tail call void @bar_i16(i16 signext %load) + tail call void @bar_i32(i32 signext %zext) + br label %bb2 + +bb2: + ret void +} +declare void @bar_i16(i16); From a51f4afc5aec8145091fead1d68c81e7d210fc0d Mon Sep 17 00:00:00 2001 From: Shimin Cui Date: Thu, 22 Feb 2024 12:04:08 -0500 Subject: [PATCH 234/351] [HCS] Externd to outline overlapping sub/super cold regions (#80732) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, with hot cold splitting, when a cold region is identified, it is added to the region list of ColdBlocks. 
Then when another cold region (B) identified overlaps with a ColdBlocks region (A) already added to the list, the region B is not added to the list because of the overlapping with region A. The splitting analysis is performed, and the region A may not get split, for example, if it’s considered too expansive. This is to improve the handling the overlapping case when the region A is not considered good for splitting, while the region B is good for splitting.   The change is to move the cold region splitting analysis earlier to allow more cold region splitting. If an identified region cannot be split, it will not be added to the candidate list of ColdBlocks for overlapping check. --- .../llvm/Transforms/IPO/HotColdSplitting.h | 15 +- llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 154 ++++++++++-------- .../assumption-cache-invalidation.ll | 8 +- llvm/test/Transforms/HotColdSplit/eh-pads.ll | 7 +- .../HotColdSplit/outline-disjoint-diamonds.ll | 9 +- .../HotColdSplit/outline-inner-region.ll | 49 ++++++ .../HotColdSplit/outline-outer-region.ll | 52 ++++++ 7 files changed, 212 insertions(+), 82 deletions(-) create mode 100644 llvm/test/Transforms/HotColdSplit/outline-inner-region.ll create mode 100644 llvm/test/Transforms/HotColdSplit/outline-outer-region.ll diff --git a/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h b/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h index c87c6453500c5..13dda6d61284c 100644 --- a/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h +++ b/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h @@ -24,6 +24,7 @@ class TargetTransformInfo; class OptimizationRemarkEmitter; class AssumptionCache; class DominatorTree; +class CodeExtractor; class CodeExtractorAnalysisCache; /// A sequence of basic blocks. 
@@ -43,19 +44,17 @@ class HotColdSplitting { private: bool isFunctionCold(const Function &F) const; - bool isBasicBlockCold(BasicBlock* BB, - BranchProbability ColdProbThresh, - SmallPtrSetImpl &ColdBlocks, + bool isBasicBlockCold(BasicBlock *BB, BranchProbability ColdProbThresh, SmallPtrSetImpl &AnnotatedColdBlocks, BlockFrequencyInfo *BFI) const; bool shouldOutlineFrom(const Function &F) const; bool outlineColdRegions(Function &F, bool HasProfileSummary); - Function *extractColdRegion(const BlockSequence &Region, + bool isSplittingBeneficial(CodeExtractor &CE, const BlockSequence &Region, + TargetTransformInfo &TTI); + Function *extractColdRegion(BasicBlock &EntryPoint, CodeExtractor &CE, const CodeExtractorAnalysisCache &CEAC, - DominatorTree &DT, BlockFrequencyInfo *BFI, - TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, - AssumptionCache *AC, unsigned Count); + BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE); ProfileSummaryInfo *PSI; function_ref GetBFI; function_ref GetTTI; diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index fabb3c5fb921d..5f03bd59b8cd1 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -215,15 +215,10 @@ bool HotColdSplitting::isFunctionCold(const Function &F) const { return false; } -bool HotColdSplitting::isBasicBlockCold(BasicBlock *BB, - BranchProbability ColdProbThresh, - SmallPtrSetImpl &ColdBlocks, - SmallPtrSetImpl &AnnotatedColdBlocks, - BlockFrequencyInfo *BFI) const { - // This block is already part of some outlining region. 
- if (ColdBlocks.count(BB)) - return true; - +bool HotColdSplitting::isBasicBlockCold( + BasicBlock *BB, BranchProbability ColdProbThresh, + SmallPtrSetImpl &AnnotatedColdBlocks, + BlockFrequencyInfo *BFI) const { if (BFI) { if (PSI->isColdBlock(BB, BFI)) return true; @@ -372,18 +367,12 @@ static int getOutliningPenalty(ArrayRef Region, return Penalty; } -Function *HotColdSplitting::extractColdRegion( - const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC, - DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) { +// Determine if it is beneficial to split the \p Region. +bool HotColdSplitting::isSplittingBeneficial(CodeExtractor &CE, + const BlockSequence &Region, + TargetTransformInfo &TTI) { assert(!Region.empty()); - // TODO: Pass BFI and BPI to update profile information. - CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr, - /* BPI */ nullptr, AC, /* AllowVarArgs */ false, - /* AllowAlloca */ false, /* AllocaBlock */ nullptr, - /* Suffix */ "cold." + std::to_string(Count)); - // Perform a simple cost/benefit analysis to decide whether or not to permit // splitting. SetVector Inputs, Outputs, Sinks; @@ -394,9 +383,18 @@ Function *HotColdSplitting::extractColdRegion( LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit << ", penalty = " << OutliningPenalty << "\n"); if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty) - return nullptr; + return false; + + return true; +} - Function *OrigF = Region[0]->getParent(); +// Split the single \p EntryPoint cold region. \p CE is the region code +// extractor. 
+Function *HotColdSplitting::extractColdRegion( + BasicBlock &EntryPoint, CodeExtractor &CE, + const CodeExtractorAnalysisCache &CEAC, BlockFrequencyInfo *BFI, + TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE) { + Function *OrigF = EntryPoint.getParent(); if (Function *OutF = CE.extractCodeRegion(CEAC)) { User *U = *OutF->user_begin(); CallInst *CI = cast(U); @@ -419,7 +417,7 @@ Function *HotColdSplitting::extractColdRegion( LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "HotColdSplit", - &*Region[0]->begin()) + &*EntryPoint.begin()) << ore::NV("Original", OrigF) << " split cold code into " << ore::NV("Split", OutF); }); @@ -428,9 +426,9 @@ Function *HotColdSplitting::extractColdRegion( ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", - &*Region[0]->begin()) + &*EntryPoint.begin()) << "Failed to extract region at block " - << ore::NV("Block", Region.front()); + << ore::NV("Block", &EntryPoint); }); return nullptr; } @@ -620,16 +618,18 @@ class OutliningRegion { } // namespace bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { - bool Changed = false; - - // The set of cold blocks. + // The set of cold blocks outlined. SmallPtrSet ColdBlocks; + // The set of cold blocks cannot be outlined. + SmallPtrSet CannotBeOutlinedColdBlocks; + // Set of cold blocks obtained with RPOT. SmallPtrSet AnnotatedColdBlocks; - // The worklist of non-intersecting regions left to outline. - SmallVector OutliningWorklist; + // The worklist of non-intersecting regions left to outline. The first member + // of the pair is the entry point into the region to be outlined. + SmallVector, 2> OutliningWorklist; // Set up an RPO traversal. 
Experimentally, this performs better (outlines // more) than a PO traversal, because we prevent region overlap by keeping @@ -655,10 +655,18 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { if (ColdBranchProbDenom.getNumOccurrences()) ColdProbThresh = BranchProbability(1, ColdBranchProbDenom.getValue()); + unsigned OutlinedFunctionID = 1; // Find all cold regions. for (BasicBlock *BB : RPOT) { - if (!isBasicBlockCold(BB, ColdProbThresh, ColdBlocks, AnnotatedColdBlocks, - BFI)) + // This block is already part of some outlining region. + if (ColdBlocks.count(BB)) + continue; + + // This block is already part of some region cannot be outlined. + if (CannotBeOutlinedColdBlocks.count(BB)) + continue; + + if (!isBasicBlockCold(BB, ColdProbThresh, AnnotatedColdBlocks, BFI)) continue; LLVM_DEBUG({ @@ -681,50 +689,68 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { return markFunctionCold(F); } - // If this outlining region intersects with another, drop the new region. - // - // TODO: It's theoretically possible to outline more by only keeping the - // largest region which contains a block, but the extra bookkeeping to do - // this is tricky/expensive. - bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) { - return !ColdBlocks.insert(Block.first).second; - }); - if (RegionsOverlap) - continue; + do { + BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT); + LLVM_DEBUG({ + dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; + for (BasicBlock *BB : SubRegion) + BB->dump(); + }); + + // TODO: Pass BFI and BPI to update profile information. + CodeExtractor CE( + SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr, + /* BPI */ nullptr, AC, /* AllowVarArgs */ false, + /* AllowAlloca */ false, /* AllocaBlock */ nullptr, + /* Suffix */ "cold." 
+ std::to_string(OutlinedFunctionID)); + + if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) && + // If this outlining region intersects with another, drop the new + // region. + // + // TODO: It's theoretically possible to outline more by only keeping + // the largest region which contains a block, but the extra + // bookkeeping to do this is tricky/expensive. + none_of(SubRegion, [&](BasicBlock *Block) { + return ColdBlocks.contains(Block); + })) { + ColdBlocks.insert(SubRegion.begin(), SubRegion.end()); + + for (auto *Block : SubRegion) { + LLVM_DEBUG(dbgs() + << " contains cold block:" << Block->getName() << "\n"); + } + + OutliningWorklist.emplace_back( + std::make_pair(SubRegion[0], std::move(CE))); + ++OutlinedFunctionID; + } else { + // The cold block region cannot be outlined. + for (auto *Block : SubRegion) + if ((DT->dominates(BB, Block) && PDT->dominates(Block, BB)) || + (PDT->dominates(BB, Block) && DT->dominates(Block, BB))) + // Will skip this cold block in the loop to save the compile time + CannotBeOutlinedColdBlocks.insert(Block); + } + } while (!Region.empty()); - OutliningWorklist.emplace_back(std::move(Region)); ++NumColdRegionsFound; } } if (OutliningWorklist.empty()) - return Changed; + return false; // Outline single-entry cold regions, splitting up larger regions as needed. - unsigned OutlinedFunctionID = 1; // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 
CodeExtractorAnalysisCache CEAC(F); - do { - OutliningRegion Region = OutliningWorklist.pop_back_val(); - assert(!Region.empty() && "Empty outlining region in worklist"); - do { - BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT); - LLVM_DEBUG({ - dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; - for (BasicBlock *BB : SubRegion) - BB->dump(); - }); - - Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI, - ORE, AC, OutlinedFunctionID); - if (Outlined) { - ++OutlinedFunctionID; - Changed = true; - } - } while (!Region.empty()); - } while (!OutliningWorklist.empty()); + for (auto &BCE : OutliningWorklist) { + Function *Outlined = + extractColdRegion(*BCE.first, BCE.second, CEAC, BFI, TTI, ORE); + assert(Outlined && "Should be outlined"); + } - return Changed; + return true; } bool HotColdSplitting::run(Module &M) { diff --git a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll index 2154fb5cb5bc1..8bc71148352d2 100644 --- a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll +++ b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll @@ -13,13 +13,13 @@ target triple = "aarch64" ; CHECK-NOT: @llvm.assume ; CHECK: } ; CHECK: declare {{.*}}@llvm.assume -; CHECK: define {{.*}}@f.cold.1() -; CHECK-LABEL: newFuncRoot: -; CHECK: } -; CHECK: define {{.*}}@f.cold.2(i64 %load1) +; CHECK: define {{.*}}@f.cold.1(i64 %load1) ; CHECK-LABEL: newFuncRoot: ; CHECK: %cmp1 = icmp eq i64 %load1, 0 ; CHECK-NOT: call void @llvm.assume +; CHECK: define {{.*}}@f.cold.2() +; CHECK-LABEL: newFuncRoot: +; CHECK: } define void @f() { entry: diff --git a/llvm/test/Transforms/HotColdSplit/eh-pads.ll b/llvm/test/Transforms/HotColdSplit/eh-pads.ll index 415c7e4b2bde3..ad7baf97f68d0 100644 --- a/llvm/test/Transforms/HotColdSplit/eh-pads.ll +++ b/llvm/test/Transforms/HotColdSplit/eh-pads.ll @@ -84,13 +84,16 @@ cold4: ; CHECK: sink ; 
CHECK-LABEL: define {{.*}}@bar.cold.1( +; CHECK: sideeffect(i32 0) + +; CHECK-LABEL: define {{.*}}@bar.cold.2( ; CHECK: sideeffect(i32 1) ; CHECK-LABEL: define {{.*}}@baz.cold.1( -; CHECK: sideeffect(i32 1) +; CHECK: sideeffect(i32 0) ; CHECK-LABEL: define {{.*}}@baz.cold.2( -; CHECK: sideeffect(i32 0) +; CHECK: sideeffect(i32 1) declare void @sideeffect(i32) diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll index 65f8aad424066..0c055981260b2 100644 --- a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll +++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll @@ -1,10 +1,10 @@ ; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-threshold=-1 < %s 2>&1 | FileCheck %s ; CHECK-LABEL: define {{.*}}@fun -; CHECK: call {{.*}}@fun.cold.2( -; CHECK-NEXT: ret void ; CHECK: call {{.*}}@fun.cold.1( ; CHECK-NEXT: ret void +; CHECK: call {{.*}}@fun.cold.2( +; CHECK-NEXT: ret void define void @fun() { entry: br i1 undef, label %A.then, label %A.else @@ -49,9 +49,10 @@ B.cleanup: } ; CHECK-LABEL: define {{.*}}@fun.cold.1( -; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ] +; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ] ; CHECK-NEXT: unreachable ; CHECK-LABEL: define {{.*}}@fun.cold.2( -; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ] +; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ] ; CHECK-NEXT: unreachable + diff --git a/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll b/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll new file mode 100644 index 0000000000000..73398bf365ff0 --- /dev/null +++ b/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-max-params=1 < %s | FileCheck %s + +target datalayout = "E-m:a-p:32:32-i64:64-n32" +target triple = "powerpc64-ibm-aix7.2.0.0" + +define 
void @foo(i32 %cond) { +; CHECK-LABEL: define {{.*}}@foo( +; CHECK: if.then: +; CHECK: br i1 {{.*}}, label %if.then1, label %codeRepl +; CHECK-LABEL: codeRepl: +; CHECK-NEXT: call void @foo.cold.1 +; +entry: + %cond.addr = alloca i32 + store i32 %cond, ptr %cond.addr + %0 = load i32, ptr %cond.addr + %tobool = icmp ne i32 %0, 0 + br i1 %tobool, label %if.then, label %if.end2 + +if.then: ; preds = %entry + %1 = load i32, ptr %cond.addr + call void @sink(i32 %0) + %cmp = icmp sgt i32 %1, 10 + br i1 %cmp, label %if.then1, label %if.else + +if.then1: ; preds = %if.then + call void @sideeffect(i32 2) + br label %if.end + +if.else: ; preds = %if.then + call void @sink(i32 0) + call void @sideeffect(i32 0) + br label %if.end + +if.end: ; preds = %if.else, %if.then1 + br label %if.end2 + +if.end2: ; preds = %entry + call void @sideeffect(i32 1) + ret void +} + +; CHECK-LABEL: define {{.*}}@foo.cold.1 +; CHECK: call {{.*}}@sink +; CHECK-NEXT: call {{.*}}@sideeffect + +declare void @sideeffect(i32) + +declare void @sink(i32) cold diff --git a/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll b/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll new file mode 100644 index 0000000000000..4a3c96982a87b --- /dev/null +++ b/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll @@ -0,0 +1,52 @@ +; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-threshold=2 < %s | FileCheck %s + +target datalayout = "E-m:a-p:32:32-i64:64-n32" +target triple = "powerpc64-ibm-aix7.2.0.0" + +define void @foo(i32 %cond, i32 %s0, i32 %s1) { +; CHECK-LABEL: define {{.*}}@foo( +; CHECK: br i1 {{.*}}, label %codeRepl, label %if.end2 +; CHECK-LABEL: codeRepl: +; CHECK-NEXT: call void @foo.cold.1 +; CHECK-LABEL: if.end2: +; CHECK: call void @sideeffect +; +entry: + %cond.addr = alloca i32 + store i32 %cond, ptr %cond.addr + %0 = load i32, ptr %cond.addr + %tobool = icmp ne i32 %0, 0 + br i1 %tobool, label %if.then, label %if.end2 + +if.then: ; preds = %entry + %1 = load i32, ptr %cond.addr + 
%cmp = icmp sgt i32 %1, 10 + br i1 %cmp, label %if.then1, label %if.else + +if.then1: ; preds = %if.then + call void @sideeffect(i32 0) + br label %if.end + +if.else: ; preds = %if.then + call void @sink(i32 %s0) + call void @sideeffect(i32 1) + br label %if.end + +if.end: ; preds = %if.else, %if.then1 + call void @sink(i32 %0) + ret void + +if.end2: ; preds = %entry + call void @sideeffect(i32 %s1) + ret void +} + +; CHECK-LABEL: define {{.*}}@foo.cold.1 +; CHECK: call {{.*}}@sink +; CHECK: call {{.*}}@sideeffect +; CHECK: call {{.*}}@sideeffect +; CHECK: call {{.*}}@sink + +declare void @sideeffect(i32) + +declare void @sink(i32) cold From c1716e3fcf4e43b4a328731920f76b2fce9485d0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 22 Feb 2024 09:06:49 -0800 Subject: [PATCH 235/351] [DAGCombiner][RISCV] CSE zext nneg and sext. (#82597) If we have a sext and a zext nneg with the same types and operand we should combine them into the sext. We can't go the other way because the nneg flag may only be valid in the context of the uses of the zext nneg. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 ++ llvm/test/CodeGen/RISCV/sext-zext-trunc.ll | 69 +++++++------------ 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 89ef648ee7d7e..ed43dd7f52882 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13997,6 +13997,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; + // CSE zext nneg with sext if the zext is not free. 
+ if (N->getFlags().hasNonNeg() && !TLI.isZExtFree(N0.getValueType(), VT)) { + SDNode *CSENode = DAG.getNodeIfExists(ISD::SIGN_EXTEND, N->getVTList(), N0); + if (CSENode) + return SDValue(CSENode, 0); + } + return SDValue(); } diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll index 09516d91771ca..87f2a6306bd60 100644 --- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll +++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll @@ -882,11 +882,10 @@ define void @load_zext_nneg_sext_cse(ptr %p) nounwind { ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu s0, 0(a0) -; RV32I-NEXT: slli a0, s0, 16 -; RV32I-NEXT: bltz a0, .LBB50_2 +; RV32I-NEXT: lh s0, 0(a0) +; RV32I-NEXT: bltz s0, .LBB50_2 ; RV32I-NEXT: # %bb.1: # %bb1 -; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call bar_i16 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -899,48 +898,26 @@ define void @load_zext_nneg_sext_cse(ptr %p) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV64I-LABEL: load_zext_nneg_sext_cse: -; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu s0, 0(a0) -; RV64I-NEXT: slli a0, s0, 48 -; RV64I-NEXT: bltz a0, .LBB50_2 -; RV64I-NEXT: # %bb.1: # %bb1 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: call bar_i16 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: tail bar_i32 -; RV64I-NEXT: .LBB50_2: # %bb2 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: load_zext_nneg_sext_cse: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: addi sp, sp, -16 -; 
RV64ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64ZBB-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64ZBB-NEXT: lhu s0, 0(a0) -; RV64ZBB-NEXT: sext.h a0, s0 -; RV64ZBB-NEXT: bltz a0, .LBB50_2 -; RV64ZBB-NEXT: # %bb.1: # %bb1 -; RV64ZBB-NEXT: call bar_i16 -; RV64ZBB-NEXT: mv a0, s0 -; RV64ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64ZBB-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64ZBB-NEXT: addi sp, sp, 16 -; RV64ZBB-NEXT: tail bar_i32 -; RV64ZBB-NEXT: .LBB50_2: # %bb2 -; RV64ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64ZBB-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64ZBB-NEXT: addi sp, sp, 16 -; RV64ZBB-NEXT: ret +; RV64-LABEL: load_zext_nneg_sext_cse: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64-NEXT: lh s0, 0(a0) +; RV64-NEXT: bltz s0, .LBB50_2 +; RV64-NEXT: # %bb.1: # %bb1 +; RV64-NEXT: mv a0, s0 +; RV64-NEXT: call bar_i16 +; RV64-NEXT: mv a0, s0 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: tail bar_i32 +; RV64-NEXT: .LBB50_2: # %bb2 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret %load = load i16, ptr %p %zext = zext nneg i16 %load to i32 %cmp = icmp sgt i16 %load, -1 From 5b53fa04db33a931b843b32946065490513484bf Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 22 Feb 2024 09:07:21 -0800 Subject: [PATCH 236/351] [RISCV] Enable -riscv-enable-sink-fold by default. (#82026) AArch64 has had it enabled since late November, so hopefully the main issues have been resolved. I see a small reduction in dynamic instruction count on every benchmark in specint2017. The best improvement was 0.3% so nothing amazing. 
--- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 +- llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll | 8 ++++---- .../test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll | 8 ++++---- llvm/test/CodeGen/RISCV/split-offsets.ll | 4 ++-- llvm/test/CodeGen/RISCV/srem-vector-lkk.ll | 8 ++++---- llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 8 ++++---- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index adef40e19cba4..3e20e451410f6 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -84,7 +84,7 @@ static cl::opt EnableRISCVDeadRegisterElimination( static cl::opt EnableSinkFold("riscv-enable-sink-fold", cl::desc("Enable sinking and folding of instruction copies"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt EnableLoopDataPrefetch("riscv-enable-loop-data-prefetch", cl::Hidden, diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index 91e73992bdfa3..3c2e84689c979 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ -; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I %s +; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: llc -mtriple=riscv32 -verify-machineinstrs -code-model=medium < %s \ -; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I-MEDIUM %s +; RUN: | FileCheck -check-prefix=RV32I-MEDIUM %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ -; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV64I %s +; RUN: | FileCheck -check-prefix=RV64I %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs -code-model=medium < %s \ -; RUN: -riscv-enable-sink-fold | FileCheck 
-check-prefix=RV64I-MEDIUM %s +; RUN: | FileCheck -check-prefix=RV64I-MEDIUM %s ; We can often fold an ADDI into the offset of load/store instructions: ; (load (addi base, off1), off2) -> (load base, off1+off2) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 88c299a19fb4e..a09ab3ee0252a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=ilp32d \ -; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \ -; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F declare <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i8>) diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index fc35bc4d2a16d..8d065daa2067c 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll 
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefix=RV32I -; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefix=RV64I ; Check that memory accesses to array elements with large offsets have those diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index ec6e978c2c68e..7fc4713ac2d6e 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32I %s -; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32IM %s -; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64IM %s define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index eea8e64f2dddb..540883fdc517a 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -1,11 
+1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s -; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s -; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s From 26cc6f126a3b25644c595b3a5a0417b1e1ab42a8 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Thu, 22 Feb 2024 09:09:08 -0800 Subject: [PATCH 237/351] [OpenACC] Implement 'break' and 'continue' errors for Compute Cnstrcts (#82543) OpenACC3.3 2.5.4 says: "A program may not branch into or out of a compute construct". While some of this restriction isn't particularly checkable, 'break' and 'continue' are possible and pretty trivial, so this patch implements those limitations. It IS unclear in the case of a 'break' in a 'switch' what should happen (an antagonistic reading of the standard would prevent it from appearing), however we're choosing to special-case the break-in-switch to ensure that this works (albeit, a 'parallel' directive on a 'switch' isn't particularly useful, though permitted). Future implementations of this rule will be in a follow-up patch. 
--- .../clang/Basic/DiagnosticSemaKinds.td | 2 + clang/include/clang/Sema/Scope.h | 17 ++++ clang/lib/Parse/ParseOpenACC.cpp | 17 ++++ clang/lib/Sema/Scope.cpp | 1 + clang/lib/Sema/SemaStmt.cpp | 22 +++++ clang/test/SemaOpenACC/no-branch-in-out.c | 95 +++++++++++++++++++ 6 files changed, 154 insertions(+) create mode 100644 clang/test/SemaOpenACC/no-branch-in-out.c diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a96f69d6ac760..ebda201361fb0 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12203,4 +12203,6 @@ def warn_acc_clause_unimplemented def err_acc_construct_appertainment : Error<"OpenACC construct '%0' cannot be used here; it can only " "be used in a statement context">; +def err_acc_branch_in_out + : Error<"invalid branch %select{out of|into}0 OpenACC region">; } // end of sema component. diff --git a/clang/include/clang/Sema/Scope.h b/clang/include/clang/Sema/Scope.h index 9e81706cd2aa1..e7f166fe3461f 100644 --- a/clang/include/clang/Sema/Scope.h +++ b/clang/include/clang/Sema/Scope.h @@ -150,6 +150,9 @@ class Scope { /// template scope in between), the outer scope does not increase the /// depth of recursion. LambdaScope = 0x8000000, + /// This is the scope of an OpenACC Compute Construct, which restricts + /// jumping into/out of it. + OpenACCComputeConstructScope = 0x10000000, }; private: @@ -469,6 +472,14 @@ class Scope { return false; } + /// Return true if this scope is a loop. + bool isLoopScope() const { + // 'switch' is the only loop that is not a 'break' scope as well, so we can + // just check BreakScope and not SwitchScope. 
+ return (getFlags() & Scope::BreakScope) && + !(getFlags() & Scope::SwitchScope); + } + /// Determines whether this scope is the OpenMP directive scope bool isOpenMPDirectiveScope() const { return (getFlags() & Scope::OpenMPDirectiveScope); @@ -504,6 +515,12 @@ class Scope { return getFlags() & Scope::OpenMPOrderClauseScope; } + /// Determine whether this scope is the statement associated with an OpenACC + /// Compute construct directive. + bool isOpenACCComputeConstructScope() const { + return getFlags() & Scope::OpenACCComputeConstructScope; + } + /// Determine whether this scope is a while/do/for statement, which can have /// continue statements embedded into it. bool isContinueScope() const { diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 50e78e8687aea..4946a61fca007 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -560,6 +560,21 @@ bool doesDirectiveHaveAssociatedStmt(OpenACCDirectiveKind DirKind) { llvm_unreachable("Unhandled directive->assoc stmt"); } +unsigned getOpenACCScopeFlags(OpenACCDirectiveKind DirKind) { + switch (DirKind) { + case OpenACCDirectiveKind::Parallel: + // Mark this as a BreakScope/ContinueScope as well as a compute construct + // so that we can diagnose trying to 'break'/'continue' inside of one. 
+ return Scope::BreakScope | Scope::ContinueScope | + Scope::OpenACCComputeConstructScope; + case OpenACCDirectiveKind::Invalid: + llvm_unreachable("Shouldn't be creating a scope for an invalid construct"); + default: + break; + } + return 0; +} + } // namespace // OpenACC 3.3, section 1.7: @@ -1228,6 +1243,8 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { if (doesDirectiveHaveAssociatedStmt(DirInfo.DirKind)) { ParsingOpenACCDirectiveRAII DirScope(*this, /*Value=*/false); + ParseScope ACCScope(this, getOpenACCScopeFlags(DirInfo.DirKind)); + AssocStmt = getActions().ActOnOpenACCAssociatedStmt(DirInfo.DirKind, ParseStatement()); } diff --git a/clang/lib/Sema/Scope.cpp b/clang/lib/Sema/Scope.cpp index 4570d8c615fe5..cea6a62e34747 100644 --- a/clang/lib/Sema/Scope.cpp +++ b/clang/lib/Sema/Scope.cpp @@ -225,6 +225,7 @@ void Scope::dumpImpl(raw_ostream &OS) const { {CompoundStmtScope, "CompoundStmtScope"}, {ClassInheritanceScope, "ClassInheritanceScope"}, {CatchScope, "CatchScope"}, + {OpenACCComputeConstructScope, "OpenACCComputeConstructScope"}, }; for (auto Info : FlagInfo) { diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index dde3bd84e89f8..fcad09a63662b 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3356,6 +3356,14 @@ Sema::ActOnContinueStmt(SourceLocation ContinueLoc, Scope *CurScope) { // initialization of that variable. return StmtError(Diag(ContinueLoc, diag::err_continue_from_cond_var_init)); } + + // A 'continue' that would normally have execution continue on a block outside + // of a compute construct counts as 'branching out of' the compute construct, + // so diagnose here. 
+ if (S->isOpenACCComputeConstructScope()) + return StmtError(Diag(ContinueLoc, diag::err_acc_branch_in_out) + << /*out of */ 0); + CheckJumpOutOfSEHFinally(*this, ContinueLoc, *S); return new (Context) ContinueStmt(ContinueLoc); @@ -3371,6 +3379,20 @@ Sema::ActOnBreakStmt(SourceLocation BreakLoc, Scope *CurScope) { if (S->isOpenMPLoopScope()) return StmtError(Diag(BreakLoc, diag::err_omp_loop_cannot_use_stmt) << "break"); + + // OpenACC doesn't allow 'break'ing from a compute construct, so diagnose if + // we are trying to do so. This can come in 2 flavors: 1-the break'able thing + // (besides the compute construct) 'contains' the compute construct, at which + // point the 'break' scope will be the compute construct. Else it could be a + // loop of some sort that has a direct parent of the compute construct. + // However, a 'break' in a 'switch' marked as a compute construct doesn't + // count as 'branch out of' the compute construct. + if (S->isOpenACCComputeConstructScope() || + (S->isLoopScope() && S->getParent() && + S->getParent()->isOpenACCComputeConstructScope())) + return StmtError(Diag(BreakLoc, diag::err_acc_branch_in_out) + << /*out of */ 0); + CheckJumpOutOfSEHFinally(*this, BreakLoc, *S); return new (Context) BreakStmt(BreakLoc); diff --git a/clang/test/SemaOpenACC/no-branch-in-out.c b/clang/test/SemaOpenACC/no-branch-in-out.c new file mode 100644 index 0000000000000..622cf55f48473 --- /dev/null +++ b/clang/test/SemaOpenACC/no-branch-in-out.c @@ -0,0 +1,95 @@ +// RUN: %clang_cc1 %s -verify -fopenacc + +void BreakContinue() { + +#pragma acc parallel + for(int i =0; i < 5; ++i) { + switch(i) { + case 0: + break; // leaves switch, not 'for'. 
+ default: + i +=2; + break; + } + if (i == 2) + continue; + + break; // expected-error{{invalid branch out of OpenACC region}} + } + + int j; + switch(j) { + case 0: +#pragma acc parallel + { + break; // expected-error{{invalid branch out of OpenACC region}} + } + case 1: +#pragma acc parallel + { + } + break; + } + +#pragma acc parallel + for(int i = 0; i < 5; ++i) { + if (i > 1) + break; // expected-error{{invalid branch out of OpenACC region}} + } + +#pragma acc parallel + switch(j) { + case 1: + break; + } + +#pragma acc parallel + { + for(int i = 1; i < 100; i++) { + if (i > 4) + break; + } + } + + for (int i =0; i < 5; ++i) { +#pragma acc parallel + { + continue; // expected-error{{invalid branch out of OpenACC region}} + } + } + +#pragma acc parallel + for (int i =0; i < 5; ++i) { + continue; + } + +#pragma acc parallel + for (int i =0; i < 5; ++i) { + { + continue; + } + } + + for (int i =0; i < 5; ++i) { +#pragma acc parallel + { + break; // expected-error{{invalid branch out of OpenACC region}} + } + } + +#pragma acc parallel + while (j) { + --j; + if (j > 4) + break; // expected-error{{invalid branch out of OpenACC region}} + } + +#pragma acc parallel + do { + --j; + if (j > 4) + break; // expected-error{{invalid branch out of OpenACC region}} + } while (j ); + +} + From 87b1e735b28f81d9012fd302cd07385db50a274f Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 01:16:39 +0800 Subject: [PATCH 238/351] [ConstraintElim] Decompose sext-like insts for signed predicates (#82344) Alive2: https://alive2.llvm.org/ce/z/A8dtGp Fixes #82271. 
--- .../Scalar/ConstraintElimination.cpp | 13 ++- .../ConstraintElimination/minmax.ll | 9 +- .../Transforms/ConstraintElimination/sext.ll | 84 +++++++++++++------ 3 files changed, 71 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index db05c63f388fb..9b6a39e98f5ce 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -499,6 +499,8 @@ static Decomposition decompose(Value *V, if (!Ty->isIntegerTy() || Ty->getIntegerBitWidth() > 64) return V; + bool IsKnownNonNegative = false; + // Decompose \p V used with a signed predicate. if (IsSigned) { if (auto *CI = dyn_cast(V)) { @@ -507,6 +509,14 @@ static Decomposition decompose(Value *V, } Value *Op0; Value *Op1; + + if (match(V, m_SExt(m_Value(Op0)))) + V = Op0; + else if (match(V, m_NNegZExt(m_Value(Op0)))) { + V = Op0; + IsKnownNonNegative = true; + } + if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) return MergeResults(Op0, Op1, IsSigned); @@ -529,7 +539,7 @@ static Decomposition decompose(Value *V, } } - return V; + return {V, IsKnownNonNegative}; } if (auto *CI = dyn_cast(V)) { @@ -539,7 +549,6 @@ static Decomposition decompose(Value *V, } Value *Op0; - bool IsKnownNonNegative = false; if (match(V, m_ZExt(m_Value(Op0)))) { IsKnownNonNegative = true; V = Op0; diff --git a/llvm/test/Transforms/ConstraintElimination/minmax.ll b/llvm/test/Transforms/ConstraintElimination/minmax.ll index ab3e9f381245b..029b6508a2106 100644 --- a/llvm/test/Transforms/ConstraintElimination/minmax.ll +++ b/llvm/test/Transforms/ConstraintElimination/minmax.ll @@ -611,8 +611,7 @@ define i64 @pr82271(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]]) -; CHECK-NEXT: ret i64 
[[SMAX]] +; CHECK-NEXT: ret i64 [[SB]] ; CHECK: else: ; CHECK-NEXT: ret i64 0 ; @@ -641,8 +640,7 @@ define i64 @pr82271_sext_zext_nneg(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = zext nneg i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]]) -; CHECK-NEXT: ret i64 [[SMAX]] +; CHECK-NEXT: ret i64 [[SB]] ; CHECK: else: ; CHECK-NEXT: ret i64 0 ; @@ -671,8 +669,7 @@ define i64 @pr82271_zext_nneg(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = zext nneg i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = zext nneg i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]]) -; CHECK-NEXT: ret i64 [[SMAX]] +; CHECK-NEXT: ret i64 [[SB]] ; CHECK: else: ; CHECK-NEXT: ret i64 0 ; diff --git a/llvm/test/Transforms/ConstraintElimination/sext.ll b/llvm/test/Transforms/ConstraintElimination/sext.ll index ed8dd502b6ef9..5a8a37d0d5703 100644 --- a/llvm/test/Transforms/ConstraintElimination/sext.ll +++ b/llvm/test/Transforms/ConstraintElimination/sext.ll @@ -11,8 +11,7 @@ define i1 @cmp_sext(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: ret i1 true ; CHECK: else: ; CHECK-NEXT: ret i1 false ; @@ -31,33 +30,32 @@ else: ret i1 false } -define i1 @cmp_sext_positive_increment(i32 %a, i32 %b, i64 %c){ -; CHECK-LABEL: define i1 @cmp_sext_positive_increment( -; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) { +define i1 @cmp_sext_add(i32 %a, i32 %b){ +; CHECK-LABEL: define i1 @cmp_sext_add( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[POS:%.*]] = icmp sgt i64 [[C]], 0 -; CHECK-NEXT: call void @llvm.assume(i1 
[[POS]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] ; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: -; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 -; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], [[C]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: [[A1:%.*]] = add nsw i32 [[A]], 1 +; CHECK-NEXT: [[B1:%.*]] = add nsw i32 [[B]], 1 +; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A1]] to i64 +; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 +; CHECK-NEXT: ret i1 true ; CHECK: else: ; CHECK-NEXT: ret i1 false ; entry: - %pos = icmp sgt i64 %c, 0 - call void @llvm.assume(i1 %pos) %cmp = icmp slt i32 %a, %b br i1 %cmp, label %then, label %else then: - %sa = sext i32 %a to i64 - %sb = sext i32 %b to i64 - %add = add nsw i64 %sa, %c + %a1 = add nsw i32 %a, 1 + %b1 = add nsw i32 %b, 1 + %sa = sext i32 %a1 to i64 + %sb = sext i32 %b1 to i64 + %add = add nsw i64 %sa, 1 %cmp2 = icmp sge i64 %sb, %add ret i1 %cmp2 @@ -65,30 +63,33 @@ else: ret i1 false } -define i1 @cmp_sext_sgt(i32 %a, i32 %b){ -; CHECK-LABEL: define i1 @cmp_sext_sgt( -; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +define i1 @cmp_sext_dynamic_increment(i32 %a, i32 %b, i64 %c){ +; CHECK-LABEL: define i1 @cmp_sext_dynamic_increment( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[POS:%.*]] = icmp slt i64 [[C]], 2 +; CHECK-NEXT: call void @llvm.assume(i1 [[POS]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] ; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: [[ADD:%.*]] = add 
nsw i64 [[SA]], [[C]] +; CHECK-NEXT: ret i1 true ; CHECK: else: ; CHECK-NEXT: ret i1 false ; entry: + %pos = icmp slt i64 %c, 2 + call void @llvm.assume(i1 %pos) %cmp = icmp slt i32 %a, %b br i1 %cmp, label %then, label %else then: %sa = sext i32 %a to i64 %sb = sext i32 %b to i64 - %add = add nsw i64 %sa, 1 - %cmp2 = icmp sgt i64 %sb, %add + %add = add nsw i64 %sa, %c + %cmp2 = icmp sge i64 %sb, %add ret i1 %cmp2 else: @@ -105,8 +106,7 @@ define i1 @cmp_zext_nneg(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = zext nneg i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = zext nneg i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: ret i1 true ; CHECK: else: ; CHECK-NEXT: ret i1 false ; @@ -216,3 +216,33 @@ then: else: ret i1 false } + +define i1 @cmp_sext_sgt(i32 %a, i32 %b){ +; CHECK-LABEL: define i1 @cmp_sext_sgt( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 +; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[SB]], [[ADD]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: else: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp = icmp slt i32 %a, %b + br i1 %cmp, label %then, label %else + +then: + %sa = sext i32 %a to i64 + %sb = sext i32 %b to i64 + %add = add nsw i64 %sa, 1 + %cmp2 = icmp sgt i64 %sb, %add + ret i1 %cmp2 + +else: + ret i1 false +} From 26d71d9ed56c4c23e6284dac7a9bdf603a5801f3 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 22 Feb 2024 09:24:21 -0800 Subject: [PATCH 239/351] [llvm-readobj,ELF] Support --decompress/-z (#82594) When a section has the SHF_COMPRESSED flag, -p/-x dump the compressed content by default. 
In GNU readelf, if --decompress/-z is specified, -p/-x will dump the decompressed content. This patch implements the option. Close #82507 --- llvm/docs/CommandGuide/llvm-readelf.rst | 5 ++ llvm/docs/CommandGuide/llvm-readobj.rst | 5 ++ .../ELF/decompress-zlib-unsupported.test | 32 ++++++++ .../llvm-readobj/ELF/decompress-zlib.test | 76 +++++++++++++++++++ .../ELF/decompress-zstd-unsupported.test | 31 ++++++++ .../llvm-readobj/ELF/decompress-zstd.test | 28 +++++++ llvm/tools/llvm-readobj/ObjDumper.cpp | 26 ++++++- llvm/tools/llvm-readobj/ObjDumper.h | 4 +- llvm/tools/llvm-readobj/Opts.td | 2 + llvm/tools/llvm-readobj/llvm-readobj.cpp | 6 +- 10 files changed, 209 insertions(+), 6 deletions(-) create mode 100644 llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test create mode 100644 llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test create mode 100644 llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test create mode 100644 llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test diff --git a/llvm/docs/CommandGuide/llvm-readelf.rst b/llvm/docs/CommandGuide/llvm-readelf.rst index 6ee4a5dfb1591..675628fdda45e 100644 --- a/llvm/docs/CommandGuide/llvm-readelf.rst +++ b/llvm/docs/CommandGuide/llvm-readelf.rst @@ -38,6 +38,11 @@ OPTIONS Display the contents of the basic block address map section(s), which contain the address of each function, along with the relative offset of each basic block. +.. option:: --decompress, -z + + Dump decompressed section content when used with ``-x`` or ``-p``. + If the section(s) are not compressed, they are displayed as is. + .. option:: --demangle, -C Display demangled symbol names in the output. diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst index cb9232ef5e560..6d78a03872344 100644 --- a/llvm/docs/CommandGuide/llvm-readobj.rst +++ b/llvm/docs/CommandGuide/llvm-readobj.rst @@ -56,6 +56,11 @@ file formats. Display the address-significance table. +.. 
option:: --decompress, -z + + Dump decompressed section content when used with ``-x`` or ``-p``. + If the section(s) are not compressed, they are displayed as is. + .. option:: --expand-relocs When used with :option:`--relocs`, display each relocation in an expanded diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test new file mode 100644 index 0000000000000..f4c73de7ca6c9 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test @@ -0,0 +1,32 @@ +# UNSUPPORTED: zlib +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t + +# CHECK: String dump of section '.a': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time +# CHECK-NEXT: [ 0] . +# CHECK-NEXT: [ 8] . +# CHECK-NEXT: [ 10] . +# CHECK-NEXT: [ 18] x.c. +# CHECK-NEXT: [ 1e] . +# CHECK-NEXT: [ 20] . +# CHECK-NEXT: Hex dump of section '.b': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time +# CHECK-NEXT: 0x00000000 01000000 00000000 01000000 00000000 ................ +# CHECK-NEXT: 0x00000010 01000000 00000000 789c6304 00000200 ........x.c..... +# CHECK-NEXT: 0x00000020 02 . 
+ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .a + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000001000000000000000100000000000000789c63040000020002 + - Name: .b + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000001000000000000000100000000000000789c63040000020002 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test new file mode 100644 index 0000000000000..ea7a8854eb1a0 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test @@ -0,0 +1,76 @@ +# REQUIRES: zlib +## Test --decompress/-z. + +# RUN: yaml2obj %s -o %t + +# RUN: llvm-readelf -z -x .strings -x .not_null_terminated %t | FileCheck %s --check-prefix=HEX +# RUN: llvm-readobj --decompress -p .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=STR + +# HEX: Hex dump of section '.strings': +# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st +# HEX-NEXT: 0x00000010 72696e67 7300 rings. +# HEX: Hex dump of section '.not_null_terminated': +# HEX-NEXT: 0x00000000 6e6f006e 756c6c no.null + +# STR: String dump of section '.strings': +# STR-NEXT: [ 0] here +# STR-NEXT: [ 5] are +# STR-NEXT: [ 9] some +# STR-NEXT: [ e] strings +# STR-EMPTY: +# STR-NEXT: String dump of section '.not_null_terminated': +# STR-NEXT: [ 0] no +# STR-NEXT: [ 3] null{{$}} +# STR-NOT: {{.}} + +# RUN: llvm-readobj -x .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=COMPRESSED + +# COMPRESSED: String dump of section '.not_null_terminated': +# COMPRESSED-NEXT: [ 0] no +# COMPRESSED-NEXT: [ 3] null +# COMPRESSED-NEXT: Hex dump of section '.strings': +# COMPRESSED-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................ 
+# COMPRESSED-NEXT: 0x00000010 00000000 00000000 789ccb48 2d4a6548 ........x..H-JeH +# COMPRESSED-NEXT: 0x00000020 04e2e2fc 5c205152 9499975e cc000058 ....\ QR...^...X +# COMPRESSED-NEXT: 0x00000030 2e079b ... + +# RUN: llvm-readelf -z -p .invalid1 -x .invalid2 -x .invalid3 %t 2>&1 | FileCheck %s -DFILE=%t --check-prefix=INVALID + +# INVALID: String dump of section '.invalid1': +# INVALID-NEXT: warning: '[[FILE]]': corrupted compressed section header +# INVALID-NEXT: [ 0] . +# INVALID-NEXT: Hex dump of section '.invalid2': +# INVALID-NEXT: warning: '[[FILE]]': zlib error: Z_DATA_ERROR +# INVALID-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................ +# INVALID-NEXT: 0x00000010 00000000 00000000 78 ........x +# INVALID-EMPTY: +# INVALID-NEXT: Hex dump of section '.invalid3': +# INVALID-NEXT: warning: '[[FILE]]': unsupported compression type (3) +# INVALID-NEXT: 0x00000000 03000000 00000000 04000000 00000000 ................ +# INVALID-NEXT: 0x00000010 00000000 00000000 789c6360 ........x.c` + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .strings + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000016000000000000000000000000000000789ccb482d4a654804e2e2fc5c2051529499975ecc0000582e079b + - Name: .not_null_terminated + Type: SHT_PROGBITS + Content: 6e6f006e756c6c + - Name: .invalid1 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 01 + - Name: .invalid2 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 01000000000000001600000000000000000000000000000078 + - Name: .invalid3 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 030000000000000004000000000000000000000000000000789c6360 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test new file mode 100644 index 0000000000000..65da952687f52 --- /dev/null +++ 
b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test @@ -0,0 +1,31 @@ +# UNSUPPORTED: zstd +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t + +# CHECK: String dump of section '.a': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time +# CHECK-NEXT: [ 0] . +# CHECK-NEXT: [ 8] . +# CHECK-NEXT: [ 10] . +# CHECK-NEXT: [ 18] (./. .. +# CHECK-NEXT: [ 21] . +# CHECK-NEXT: Hex dump of section '.b': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time +# CHECK-NEXT: 0x00000000 02000000 00000000 01000000 00000000 ................ +# CHECK-NEXT: 0x00000010 01000000 00000000 28b52ffd 20010900 ........(./. ... +# CHECK-NEXT: 0x00000020 0001 .. + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .a + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001 + - Name: .b + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test new file mode 100644 index 0000000000000..519db879b18c1 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test @@ -0,0 +1,28 @@ +# REQUIRES: zstd +## Test --decompress/-z for zstd. + +# RUN: yaml2obj %s -o %t + +# RUN: llvm-readelf -z -x .strings %t | FileCheck %s --check-prefix=HEX +# RUN: llvm-readobj --decompress -p .strings %t | FileCheck %s --check-prefix=STR + +# HEX: Hex dump of section '.strings': +# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st +# HEX-NEXT: 0x00000010 72696e67 7300 rings. 
+ +# STR: String dump of section '.strings': +# STR-NEXT: [ 0] here +# STR-NEXT: [ 5] are +# STR-NEXT: [ 9] some +# STR-NEXT: [ e] strings + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .strings + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000001600000000000000000000000000000028b52ffd2016b10000686572650061726500736f6d6500737472696e677300 diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp index 59060ac217e32..0d3fea71aafd4 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.cpp +++ b/llvm/tools/llvm-readobj/ObjDumper.cpp @@ -14,6 +14,7 @@ #include "ObjDumper.h" #include "llvm-readobj.h" #include "llvm/Object/Archive.h" +#include "llvm/Object/Decompressor.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" @@ -142,8 +143,23 @@ getSectionRefsByNameOrIndex(const object::ObjectFile &Obj, return Ret; } +static void maybeDecompress(const object::ObjectFile &Obj, + StringRef SectionName, StringRef &SectionContent, + SmallString<0> &Out) { + Expected Decompressor = object::Decompressor::create( + SectionName, SectionContent, Obj.isLittleEndian(), Obj.is64Bit()); + if (!Decompressor) + reportWarning(Decompressor.takeError(), Obj.getFileName()); + else if (auto Err = Decompressor->resizeAndDecompress(Out)) + reportWarning(std::move(Err), Obj.getFileName()); + else + SectionContent = Out; +} + void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj, - ArrayRef Sections) { + ArrayRef Sections, + bool Decompress) { + SmallString<0> Out; bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { @@ -156,12 +172,16 @@ void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj, StringRef SectionContent = unwrapOrError(Obj.getFileName(), Section.getContents()); + if (Decompress && Section.isCompressed()) + maybeDecompress(Obj, SectionName, 
SectionContent, Out); printAsStringList(SectionContent); } } void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, - ArrayRef Sections) { + ArrayRef Sections, + bool Decompress) { + SmallString<0> Out; bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { @@ -174,6 +194,8 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, StringRef SectionContent = unwrapOrError(Obj.getFileName(), Section.getContents()); + if (Decompress && Section.isCompressed()) + maybeDecompress(Obj, SectionName, SectionContent, Out); const uint8_t *SecContent = SectionContent.bytes_begin(); const uint8_t *SecEnd = SecContent + SectionContent.size(); diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index 1d679453581bc..3958dd3a33333 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -175,9 +175,9 @@ class ObjDumper { void printAsStringList(StringRef StringContent, size_t StringDataOffset = 0); void printSectionsAsString(const object::ObjectFile &Obj, - ArrayRef Sections); + ArrayRef Sections, bool Decompress); void printSectionsAsHex(const object::ObjectFile &Obj, - ArrayRef Sections); + ArrayRef Sections, bool Decompress); std::function WarningHandler; void reportUniqueWarning(Error Err) const; diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td index e2d93c6ec229e..018facc278e89 100644 --- a/llvm/tools/llvm-readobj/Opts.td +++ b/llvm/tools/llvm-readobj/Opts.td @@ -20,6 +20,7 @@ def all : FF<"all", "Equivalent to setting: --file-header, --program-headers, -- def arch_specific : FF<"arch-specific", "Display architecture-specific information">; def bb_addr_map : FF<"bb-addr-map", "Display the BB address map section">; def cg_profile : FF<"cg-profile", "Display call graph profile section">; +def decompress : FF<"decompress", "Dump decompressed section content when used with -x or -p">; defm demangle : BB<"demangle", 
"Demangle symbol names", "Do not demangle symbol names (default)">; def dependent_libraries : FF<"dependent-libraries", "Display the dependent libraries section">; def dyn_relocations : FF<"dyn-relocations", "Display the dynamic relocation entries in the file">; @@ -139,3 +140,4 @@ def : F<"u", "Alias for --unwind">, Alias; def : F<"X", "Alias for --extra-sym-info">, Alias, Group; def : F<"V", "Alias for --version-info">, Alias, Group; def : JoinedOrSeparate<["-"], "x">, Alias, HelpText<"Alias for --hex-dump">, MetaVarName<"">; +def : F<"z", "Alias for --decompress">, Alias; diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index f9d605d35244b..979433d69011c 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -97,6 +97,7 @@ static bool ArchSpecificInfo; static bool BBAddrMap; bool ExpandRelocs; static bool CGProfile; +static bool Decompress; bool Demangle; static bool DependentLibraries; static bool DynRelocs; @@ -212,6 +213,7 @@ static void parseOptions(const opt::InputArgList &Args) { opts::ArchSpecificInfo = Args.hasArg(OPT_arch_specific); opts::BBAddrMap = Args.hasArg(OPT_bb_addr_map); opts::CGProfile = Args.hasArg(OPT_cg_profile); + opts::Decompress = Args.hasArg(OPT_decompress); opts::Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, false); opts::DependentLibraries = Args.hasArg(OPT_dependent_libraries); opts::DynRelocs = Args.hasArg(OPT_dyn_relocations); @@ -439,9 +441,9 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer, Dumper->printSymbols(opts::Symbols, opts::DynamicSymbols, opts::ExtraSymInfo, SymComp); if (!opts::StringDump.empty()) - Dumper->printSectionsAsString(Obj, opts::StringDump); + Dumper->printSectionsAsString(Obj, opts::StringDump, opts::Decompress); if (!opts::HexDump.empty()) - Dumper->printSectionsAsHex(Obj, opts::HexDump); + Dumper->printSectionsAsHex(Obj, opts::HexDump, opts::Decompress); if (opts::HashTable) 
Dumper->printHashTable(); if (opts::GnuHashTable) From 163eaf3bbc24e46a6ec9b71deda8c66f0354d2d7 Mon Sep 17 00:00:00 2001 From: Daniel Hoekwater Date: Thu, 22 Feb 2024 03:30:28 +0000 Subject: [PATCH 240/351] [CodeGen] Clean up MachineFunctionSplitter MBB safety checking (NFC) Move the "is MBB safe to split" check out of `isColdBlock` and update the comment since we're no longer using a temporary hack. --- llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index 38c1c56d2823e..0ddd945896992 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -109,12 +109,6 @@ static bool isColdBlock(const MachineBasicBlock &MBB, const MachineBlockFrequencyInfo *MBFI, ProfileSummaryInfo *PSI) { std::optional Count = MBFI->getBlockProfileCount(&MBB); - - // Temporary hack to cope with AArch64's jump table encoding - const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo(); - if (!TII.isMBBSafeToSplitToCold(MBB)) - return false; - // For instrumentation profiles and sample profiles, we use different ways // to judge whether a block is cold and should be split. if (PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile()) { @@ -178,7 +172,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { if (MBB.isEHPad()) LandingPads.push_back(&MBB); - else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && !SplitAllEHCode) + else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && + TII.isMBBSafeToSplitToCold(MBB) && !SplitAllEHCode) MBB.setSectionID(MBBSectionID::ColdSectionID); } @@ -190,7 +185,7 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { // Here we have UseProfileData == true. 
bool HasHotLandingPads = false; for (const MachineBasicBlock *LP : LandingPads) { - if (!isColdBlock(*LP, MBFI, PSI)) + if (!isColdBlock(*LP, MBFI, PSI) || !TII.isMBBSafeToSplitToCold(*LP)) HasHotLandingPads = true; } if (!HasHotLandingPads) { From 6599c022be7c797cd0fafeea4c538e01aae78fd4 Mon Sep 17 00:00:00 2001 From: yandalur Date: Thu, 22 Feb 2024 23:18:06 +0530 Subject: [PATCH 241/351] [HEXAGON] Fix bit boundary for isub_hi in HexagonBitSimplify (#82336) Use bit boundary of 32 for high subregisters in HexagonBitSimplify. This fixes the subregister used in an upper half register store. --- .../lib/Target/Hexagon/HexagonBitSimplify.cpp | 3 ++- .../Hexagon/bit-store-upper-sub-hi.mir | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 6024d9f7b1547..3b8234c011843 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1957,7 +1957,8 @@ bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) { return false; const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); RegHalf H; - if (!matchHalf(0, RC, 0, H)) + unsigned B = (RS.Sub == Hexagon::isub_hi) ? 32 : 0; + if (!matchHalf(0, RC, B, H)) return false; if (H.Low) return false; diff --git a/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir b/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir new file mode 100644 index 0000000000000..ef84043cf5021 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir @@ -0,0 +1,21 @@ +# RUN: llc -march=hexagon -run-pass=hexagon-bit-simplify -o - %s | FileCheck %s + +# This test checks if the HexagonBitSimplify pass correctly replaces a +# S2_storerh_io with a S2_storerf_io that stores the upper halfword +# of a high subregister using appropriate subregister boundaries. 
+ +# CHECK: S2_storerf_io %0, 28, %{{[0-9]+}}.isub_hi +# CHECK-NOT: S2_storerf_io %0, 28, %{{[0-9]+}}.isub_lo + +--- +name: test_store +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0 + %0:intregs = COPY $r0 + %1:doubleregs = IMPLICIT_DEF + %2:doubleregs = IMPLICIT_DEF + %3:doubleregs = S2_shuffoh %2, %1 + S2_storerh_io %0, 28, %3.isub_hi +... From b0edc1c45284586fdb12edd666f95d99f5f62b43 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 23 Feb 2024 01:49:19 +0800 Subject: [PATCH 242/351] [Loads] Fix crash in isSafeToLoadUnconditionally with scalable accessed type (#82650) This fixes #82606 by updating isSafeToLoadUnconditionally to handle fixed sized loads from a scalable accessed type. --- llvm/lib/Analysis/Loads.cpp | 6 +++--- .../VectorCombine/RISCV/load-widening.ll | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 6bf0d2f56eb4e..5916d2ab48ece 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -364,7 +364,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, if (Size.getBitWidth() > 64) return false; - const uint64_t LoadSize = Size.getZExtValue(); + const TypeSize LoadSize = TypeSize::getFixed(Size.getZExtValue()); // Otherwise, be a little bit aggressive by scanning the local block where we // want to check to see if the pointer is already being loaded or stored @@ -414,11 +414,11 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, // Handle trivial cases. 
if (AccessedPtr == V && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; } return false; diff --git a/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll new file mode 100644 index 0000000000000..0a43ad2f9a368 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv32 -mattr=+v | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv64 -mattr=+v | FileCheck %s + +define void @fixed_load_scalable_src(ptr %p) { +; CHECK-LABEL: define void @fixed_load_scalable_src( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: ret void +; +entry: + store zeroinitializer, ptr %p + %0 = load <4 x i16>, ptr %p + %1 = shufflevector <4 x i16> %0, <4 x i16> zeroinitializer, <8 x i32> + ret void +} From 5b079af169cd04b457465fd7ca31714efeefe6d9 Mon Sep 17 00:00:00 2001 From: Michael Jones <71531609+michaelrj-google@users.noreply.github.com> Date: Thu, 22 Feb 2024 09:52:16 -0800 Subject: [PATCH 243/351] [libc] add FXBits class (#82065) The FXBits class is what will be used to modify fixed point numbers on a bit level. This patch adds a basic implementation as well as basic tests. 
--- libc/src/__support/fixed_point/CMakeLists.txt | 2 + libc/src/__support/fixed_point/fx_bits.h | 78 ++++ libc/test/src/__support/CMakeLists.txt | 1 + .../test/src/__support/FPUtil/fpbits_test.cpp | 2 +- .../src/__support/fixed_point/CMakeLists.txt | 16 + .../__support/fixed_point/fx_bits_test.cpp | 348 ++++++++++++++++++ 6 files changed, 446 insertions(+), 1 deletion(-) create mode 100644 libc/test/src/__support/fixed_point/CMakeLists.txt create mode 100644 libc/test/src/__support/fixed_point/fx_bits_test.cpp diff --git a/libc/src/__support/fixed_point/CMakeLists.txt b/libc/src/__support/fixed_point/CMakeLists.txt index c6bb9e17adfa8..64f9dacc7ba5f 100644 --- a/libc/src/__support/fixed_point/CMakeLists.txt +++ b/libc/src/__support/fixed_point/CMakeLists.txt @@ -17,5 +17,7 @@ add_header_library( libc.include.llvm-libc-macros.stdfix_macros libc.src.__support.macros.attributes libc.src.__support.macros.optimization + libc.src.__support.CPP.type_traits libc.src.__support.CPP.bit + libc.src.__support.math_extras ) diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h index b26be169a593a..fcd47cd72cbb3 100644 --- a/libc/src/__support/fixed_point/fx_bits.h +++ b/libc/src/__support/fixed_point/fx_bits.h @@ -14,6 +14,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math_extras.h" #include "fx_rep.h" @@ -21,6 +22,83 @@ namespace LIBC_NAMESPACE::fixed_point { +template struct FXBits { +private: + using fx_rep = FXRep; + using StorageType = typename fx_rep::StorageType; + + StorageType value; + + static_assert(fx_rep::FRACTION_LEN > 0); + + static constexpr size_t FRACTION_OFFSET = 0; // Just for completeness + static constexpr size_t INTEGRAL_OFFSET = + fx_rep::INTEGRAL_LEN == 0 ? 0 : fx_rep::FRACTION_LEN; + static constexpr size_t SIGN_OFFSET = + fx_rep::SIGN_LEN == 0 + ? 
0 + : ((sizeof(StorageType) * CHAR_BIT) - fx_rep::SIGN_LEN); + + static constexpr StorageType FRACTION_MASK = + mask_trailing_ones() + << FRACTION_OFFSET; + static constexpr StorageType INTEGRAL_MASK = + mask_trailing_ones() + << INTEGRAL_OFFSET; + static constexpr StorageType SIGN_MASK = + (fx_rep::SIGN_LEN == 0 ? 0 : StorageType(1) << SIGN_OFFSET); + +public: + LIBC_INLINE constexpr FXBits() = default; + + template LIBC_INLINE constexpr explicit FXBits(XType x) { + using Unqual = typename cpp::remove_cv_t; + if constexpr (cpp::is_same_v) { + value = cpp::bit_cast(x); + } else if constexpr (cpp::is_same_v) { + value = x; + } else { + // We don't want accidental type promotions/conversions, so we require + // exact type match. + static_assert(cpp::always_false); + } + } + + LIBC_INLINE constexpr StorageType get_fraction() { + return (value & FRACTION_MASK) >> FRACTION_OFFSET; + } + + LIBC_INLINE constexpr StorageType get_integral() { + return (value & INTEGRAL_MASK) >> INTEGRAL_OFFSET; + } + + // TODO: replace bool with Sign + LIBC_INLINE constexpr bool get_sign() { + return static_cast((value & SIGN_MASK) >> SIGN_OFFSET); + } + + // This represents the effective negative exponent applied to this number + LIBC_INLINE constexpr int get_exponent() { return fx_rep::FRACTION_LEN; } + + LIBC_INLINE constexpr void set_fraction(StorageType fraction) { + value = (value & (~FRACTION_MASK)) | + ((fraction << FRACTION_OFFSET) & FRACTION_MASK); + } + + LIBC_INLINE constexpr void set_integral(StorageType integral) { + value = (value & (~INTEGRAL_MASK)) | + ((integral << INTEGRAL_OFFSET) & INTEGRAL_MASK); + } + + // TODO: replace bool with Sign + LIBC_INLINE constexpr void set_sign(bool sign) { + value = (value & (~SIGN_MASK)) | + ((static_cast(sign) << SIGN_OFFSET) & SIGN_MASK); + } + + LIBC_INLINE constexpr T get_val() const { return cpp::bit_cast(value); } +}; + // Bit-wise operations are not available for fixed point types yet. 
template LIBC_INLINE constexpr cpp::enable_if_t, T> diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 493ef9ddabe1e..9801621e6b399 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -188,4 +188,5 @@ add_subdirectory(File) add_subdirectory(RPC) add_subdirectory(OSUtil) add_subdirectory(FPUtil) +add_subdirectory(fixed_point) add_subdirectory(HashTable) diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp index 46f7d25059687..4f9f53afe5478 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -1,4 +1,4 @@ -//===-- Unittests for the DyadicFloat class -------------------------------===// +//===-- Unittests for the FPBits class ------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/libc/test/src/__support/fixed_point/CMakeLists.txt b/libc/test/src/__support/fixed_point/CMakeLists.txt new file mode 100644 index 0000000000000..384cc9394ee79 --- /dev/null +++ b/libc/test/src/__support/fixed_point/CMakeLists.txt @@ -0,0 +1,16 @@ +if(NOT LIBC_COMPILER_HAS_FIXED_POINT) + return() +endif() + +add_custom_target(libc-fixed-point-tests) + +add_libc_test( + fx_bits_test + SUITE + libc-fixed-point-tests + SRCS + fx_bits_test.cpp + DEPENDS + libc.src.__support.fixed_point.fx_bits + libc.src.__support.integer_literals +) diff --git a/libc/test/src/__support/fixed_point/fx_bits_test.cpp b/libc/test/src/__support/fixed_point/fx_bits_test.cpp new file mode 100644 index 0000000000000..58627816eb8d9 --- /dev/null +++ b/libc/test/src/__support/fixed_point/fx_bits_test.cpp @@ -0,0 +1,348 @@ +//===-- Unittests for the FXBits class ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/llvm-libc-macros/stdfix-macros.h" + +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/integer_literals.h" +#include "test/UnitTest/Test.h" + +using LIBC_NAMESPACE::fixed_point::FXBits; +using LIBC_NAMESPACE::fixed_point::FXRep; + +using LIBC_NAMESPACE::operator""_u8; +using LIBC_NAMESPACE::operator""_u16; +using LIBC_NAMESPACE::operator""_u32; +using LIBC_NAMESPACE::operator""_u64; + +// -------------------------------- SHORT TESTS -------------------------------- + +TEST(LlvmLibcFxBitsTest, FXBits_UnsignedShortFract) { + auto bits_var = FXBits(0b00000000_u8); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + // Since an unsigned fract has no sign or integral components, setting either + // should have no effect. 
+ + bits_var.set_sign(true); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + bits_var.set_integral(0xab); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + // but setting the fraction should work + + bits_var.set_fraction(0xcd); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0xcd_u8); +} + +TEST(LlvmLibcFxBitsTest, FXBits_UnsignedShortAccum) { + auto bits_var = FXBits(0b00000000'00000000_u16); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_sign(true); // 0 sign bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_integral(0xabcd); // 8 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00cd_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_fraction(0x21fe); // 8 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00cd_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x00fe_u16); +} + +TEST(LlvmLibcFxBitsTest, FXBits_ShortFract) { + auto bits_var = FXBits(0b0'0000000_u8); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + bits_var.set_integral(0xab); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 
0x00_u8); + + bits_var.set_fraction(0xcd); // 7 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x4d_u8); +} + +TEST(LlvmLibcFxBitsTest, FXBits_ShortAccum) { + auto bits_var = FXBits(0b0'00000000'0000000_u16); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_integral(0xabcd); // 8 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00cd_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_fraction(0x21fe); // 7 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00cd_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x007e_u16); +} + +TEST(LlvmLibcFxBitsTest, FXBits_UnsignedFract) { + auto bits_var = FXBits(0b0000000000000000_u16); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_sign(true); // 0 sign bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_integral(0xabcd); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_fraction(0xef12); // 16 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0xef12_u16); +} + +// -------------------------------- NORMAL TESTS ------------------------------- + 
+TEST(LlvmLibcFxBitsTest, FXBits_UnsignedAccum) { + auto bits_var = + FXBits(0b0000000000000000'0000000000000000_u32); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_sign(true); // 0 sign bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_integral(0xabcd); // 16 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000abcd_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_fraction(0xef12); // 16 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000abcd_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x0000ef12_u32); +} + +TEST(LlvmLibcFxBitsTest, FXBits_Fract) { + auto bits_var = FXBits(0b0'000000000000000_u16); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_integral(0xabcd); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_fraction(0xef12); // 15 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x6f12_u16); +} + +TEST(LlvmLibcFxBitsTest, FXBits_Accum) { + auto bits_var = FXBits(0b0'0000000000000000'000000000000000_u32); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + 
bits_var.set_sign(true); // 1 sign bit used
+
+  EXPECT_EQ(bits_var.get_sign(), true);
+  EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32);
+  EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32);
+
+  bits_var.set_integral(0xabcd); // 16 integral bits used
+
+  EXPECT_EQ(bits_var.get_sign(), true);
+  EXPECT_EQ(bits_var.get_integral(), 0x0000abcd_u32);
+  EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32);
+
+  bits_var.set_fraction(0xef12); // 15 fractional bits used
+
+  EXPECT_EQ(bits_var.get_sign(), true);
+  EXPECT_EQ(bits_var.get_integral(), 0x0000abcd_u32);
+  EXPECT_EQ(bits_var.get_fraction(), 0x00006f12_u32);
+}
+
+// --------------------------------- LONG TESTS --------------------------------
+
+TEST(LlvmLibcFxBitsTest, FXBits_UnsignedLongFract) {
+  auto bits_var =
+      FXBits(0b00000000000000000000000000000000_u32);
+
+  EXPECT_EQ(bits_var.get_sign(), false);
+  EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32);
+  EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32);
+
+  bits_var.set_sign(true); // 0 sign bits used
+
+  EXPECT_EQ(bits_var.get_sign(), false);
+  EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32);
+  EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32);
+
+  bits_var.set_integral(0xabcdef12); // 0 integral bits used
+
+  EXPECT_EQ(bits_var.get_sign(), false);
+  EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32);
+  EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32);
+
+  bits_var.set_fraction(0xfedcba98); // 32 fractional bits used
+
+  EXPECT_EQ(bits_var.get_sign(), false);
+  EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32);
+  EXPECT_EQ(bits_var.get_fraction(), 0xfedcba98_u32);
+}
+
+TEST(LlvmLibcFxBitsTest, FXBits_UnsignedLongAccum) {
+  auto bits_var = FXBits(
+      0b00000000000000000000000000000000'00000000000000000000000000000000_u64);
+
+  EXPECT_EQ(bits_var.get_sign(), false);
+  EXPECT_EQ(bits_var.get_integral(), 0x0000000000000000_u64);
+  EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64);
+
+  bits_var.set_sign(true); // 0 sign bits used
+
+  
EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000000000000000_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_integral(0xabcdef12); // 32 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000abcdef12_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_fraction(0xfedcba98); // 32 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000abcdef12_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000fedcba98_u64); +} + +TEST(LlvmLibcFxBitsTest, FXBits_LongFract) { + auto bits_var = FXBits(0b0'0000000000000000000000000000000_u32); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_integral(0xabcdef12); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_fraction(0xfedcba98); // 31 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x7edcba98_u32); +} + +TEST(LlvmLibcFxBitsTest, FXBits_LongAccum) { + auto bits_var = FXBits( + 0b0'00000000000000000000000000000000'0000000000000000000000000000000_u64); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000000000000000_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000000000000000_u64); + 
EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_integral(0xabcdef12); // 32 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000abcdef12_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_fraction(0xfedcba98); // 31 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000abcdef12_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x000000007edcba98_u64); +} From 3a85594cb340aabe7ad993eb3912987f4246925e Mon Sep 17 00:00:00 2001 From: sethp Date: Thu, 22 Feb 2024 09:52:48 -0800 Subject: [PATCH 244/351] [NFC] Fix typo in ReleaseNotes.rst (#82655) Deletes the leading 7 from the textual issue number. --- clang/docs/ReleaseNotes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bac166e6c3562..d8f8a2cb38442 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -270,7 +270,7 @@ Bug Fixes to C++ Support local variable, which is supported as a C11 extension in C++. Previously, it was only accepted at namespace scope but not at local function scope. - Clang no longer tries to call consteval constructors at runtime when they appear in a member initializer. - (`#782154 `_`) + (`#82154 `_`) - Fix crash when using an immediate-escalated function at global scope. (`#82258 `_) - Correctly immediate-escalate lambda conversion functions. 
From bc841bb0f8b55d18ed97440df878d0121701a317 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Thu, 22 Feb 2024 09:27:02 -0800 Subject: [PATCH 245/351] [clang] Rename installapi tests, NFC * Reduces redundancy --- clang/test/InstallAPI/{installapi-basic.test => basic.test} | 0 ...pi-driver-invalid-options.test => driver-invalid-options.test} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename clang/test/InstallAPI/{installapi-basic.test => basic.test} (100%) rename clang/test/InstallAPI/{installapi-driver-invalid-options.test => driver-invalid-options.test} (100%) diff --git a/clang/test/InstallAPI/installapi-basic.test b/clang/test/InstallAPI/basic.test similarity index 100% rename from clang/test/InstallAPI/installapi-basic.test rename to clang/test/InstallAPI/basic.test diff --git a/clang/test/InstallAPI/installapi-driver-invalid-options.test b/clang/test/InstallAPI/driver-invalid-options.test similarity index 100% rename from clang/test/InstallAPI/installapi-driver-invalid-options.test rename to clang/test/InstallAPI/driver-invalid-options.test From e630a451b457e4d8d071a2b4f102b342bbea2d02 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 22 Feb 2024 18:58:36 +0100 Subject: [PATCH 246/351] [HCS] Fix unused variable warnings. NFCI. 
---
 llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index 5f03bd59b8cd1..5aefcbf13182c 100644
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -716,10 +716,10 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
         })) {
       ColdBlocks.insert(SubRegion.begin(), SubRegion.end());

-      for (auto *Block : SubRegion) {
-        LLVM_DEBUG(dbgs()
-                   << "  contains cold block:" << Block->getName() << "\n");
-      }
+      LLVM_DEBUG({
+        for (auto *Block : SubRegion)
+          dbgs() << "  contains cold block:" << Block->getName() << "\n";
+      });

       OutliningWorklist.emplace_back(
           std::make_pair(SubRegion[0], std::move(CE)));
@@ -748,6 +748,7 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
     Function *Outlined =
         extractColdRegion(*BCE.first, BCE.second, CEAC, BFI, TTI, ORE);
     assert(Outlined && "Should be outlined");
+    (void)Outlined;
   }

   return true;

From ea174c09342275d6c6fec48fb846eaf28fae5b51 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Thu, 22 Feb 2024 12:01:52 -0600
Subject: [PATCH 247/351] [Libomptarget] Remove global ctor and use reference
 counting (#80499)

Summary:
Currently we rely on global constructors to initialize and shut down the
OpenMP runtime library and plugin manager. This causes some issues
because we do not have a defined lifetime that we can rely on to release
and allocate resources. This patch instead adds some simple reference
counted initialization and deinitialization functions. A future patch
will use the `deinit` interface to more intelligently handle plugin
deinitialization. Right now we do nothing and rely on `atexit` inside of
the plugins to tear them down. This isn't great because it limits our
ability to control these things.

Note that I made the `__tgt_register_lib` functions do the initialization instead of adding calls to the new runtime functions in the linker wrapper. The reason for this is because in the past it's been easier to not introduce a new function call, since sometimes the user's compiler will link against an older `libomptarget`. Maybe if we change the name with offloading in the future we can simplify this. Depends on https://github.com/llvm/llvm-project/pull/80460 --- openmp/libomptarget/include/PluginManager.h | 6 +++ openmp/libomptarget/include/omptarget.h | 6 +++ openmp/libomptarget/src/OffloadRTL.cpp | 38 +++++++++++++------ openmp/libomptarget/src/PluginManager.cpp | 2 +- openmp/libomptarget/src/exports | 2 + openmp/libomptarget/src/interface.cpp | 20 +++++++++- .../test/offloading/runtime_init.c | 30 +++++++++++++++ 7 files changed, 89 insertions(+), 15 deletions(-) create mode 100644 openmp/libomptarget/test/offloading/runtime_init.c diff --git a/openmp/libomptarget/include/PluginManager.h b/openmp/libomptarget/include/PluginManager.h index ec5d98dc8cd30..5e5306ac776f0 100644 --- a/openmp/libomptarget/include/PluginManager.h +++ b/openmp/libomptarget/include/PluginManager.h @@ -206,6 +206,12 @@ struct PluginManager { ProtectedObj Devices; }; +/// Initialize the plugin manager and OpenMP runtime. +void initRuntime(); + +/// Deinitialize the plugin and delete it. +void deinitRuntime(); + extern PluginManager *PM; #endif // OMPTARGET_PLUGIN_MANAGER_H diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index c4faa23427f11..9a2bd1340e3b4 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -312,6 +312,12 @@ void *llvm_omp_target_dynamic_shared_alloc(); /// add the clauses of the requires directives in a given file void __tgt_register_requires(int64_t Flags); +/// Initializes the runtime library. +void __tgt_rtl_init(); + +/// Deinitializes the runtime library. 
+void __tgt_rtl_deinit(); + /// adds a target shared library to the target execution image void __tgt_register_lib(__tgt_bin_desc *Desc); diff --git a/openmp/libomptarget/src/OffloadRTL.cpp b/openmp/libomptarget/src/OffloadRTL.cpp index 86ef0d5bc91cf..dd75b1b181505 100644 --- a/openmp/libomptarget/src/OffloadRTL.cpp +++ b/openmp/libomptarget/src/OffloadRTL.cpp @@ -20,25 +20,39 @@ extern void llvm::omp::target::ompt::connectLibrary(); #endif -__attribute__((constructor(101))) void init() { +static std::mutex PluginMtx; +static uint32_t RefCount = 0; + +void initRuntime() { + std::scoped_lock Lock(PluginMtx); Profiler::get(); TIMESCOPE(); - DP("Init offload library!\n"); - - PM = new PluginManager(); + if (PM == nullptr) + PM = new PluginManager(); + RefCount++; + if (RefCount == 1) { + DP("Init offload library!\n"); #ifdef OMPT_SUPPORT - // Initialize OMPT first - llvm::omp::target::ompt::connectLibrary(); + // Initialize OMPT first + llvm::omp::target::ompt::connectLibrary(); #endif - PM->init(); - - PM->registerDelayedLibraries(); + PM->init(); + PM->registerDelayedLibraries(); + } } -__attribute__((destructor(101))) void deinit() { - DP("Deinit offload library!\n"); - delete PM; +void deinitRuntime() { + std::scoped_lock Lock(PluginMtx); + assert(PM && "Runtime not initialized"); + + if (RefCount == 1) { + DP("Deinit offload library!\n"); + delete PM; + PM = nullptr; + } + + RefCount--; } diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp index 34f1f4969da30..09f9c6400569c 100644 --- a/openmp/libomptarget/src/PluginManager.cpp +++ b/openmp/libomptarget/src/PluginManager.cpp @@ -21,7 +21,7 @@ using namespace llvm; using namespace llvm::sys; -PluginManager *PM; +PluginManager *PM = nullptr; // List of all plugins that can support offloading. 
static const char *RTLNames[] = {ENABLED_OFFLOAD_PLUGINS}; diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports index af882a2642647..d5432a9eed380 100644 --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -1,5 +1,7 @@ VERS1.0 { global: + __tgt_rtl_init; + __tgt_rtl_deinit; __tgt_register_requires; __tgt_register_lib; __tgt_unregister_lib; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index d2707f39a1aa3..8b89bc3ff7124 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -38,9 +38,13 @@ EXTERN void __tgt_register_requires(int64_t Flags) { __PRETTY_FUNCTION__); } +EXTERN void __tgt_rtl_init() { initRuntime(); } +EXTERN void __tgt_rtl_deinit() { deinitRuntime(); } + //////////////////////////////////////////////////////////////////////////////// /// adds a target shared library to the target execution image EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { + initRuntime(); if (PM->delayRegisterLib(Desc)) return; @@ -49,12 +53,17 @@ EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { //////////////////////////////////////////////////////////////////////////////// /// Initialize all available devices without registering any image -EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); } +EXTERN void __tgt_init_all_rtls() { + assert(PM && "Runtime not initialized"); + PM->initAllPlugins(); +} //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) { PM->unregisterLib(Desc); + + deinitRuntime(); } template @@ -64,6 +73,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, map_var_info_t *ArgNames, void **ArgMappers, TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg, const char *RegionName) { + assert(PM && "Runtime not initialized"); 
static_assert(std::is_convertible_v, "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); @@ -239,6 +249,7 @@ template static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit, void *HostPtr, KernelArgsTy *KernelArgs) { + assert(PM && "Runtime not initialized"); static_assert(std::is_convertible_v, "Target AsyncInfoTy must be convertible to AsyncInfoTy."); DP("Entering target region for device %" PRId64 " with entry point " DPxMOD @@ -345,6 +356,7 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput, uint64_t &ReqPtrArgOffset) { + assert(PM && "Runtime not initialized"); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); auto DeviceOrErr = PM->getDevice(DeviceId); if (!DeviceOrErr) @@ -380,7 +392,7 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripCount) { - + assert(PM && "Runtime not initialized"); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); if (checkDeviceAndCtors(DeviceId, Loc)) { DP("Not offloading to device %" PRId64 "\n", DeviceId); @@ -431,6 +443,7 @@ EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base, } EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) { + assert(PM && "Runtime not initialized"); std::atomic &InfoLevel = getInfoLevelInternal(); InfoLevel.store(NewInfoLevel); for (auto &R : PM->pluginAdaptors()) { @@ -440,6 +453,7 @@ EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) { } EXTERN int __tgt_print_device_info(int64_t DeviceId) { + assert(PM && "Runtime not initialized"); auto DeviceOrErr = PM->getDevice(DeviceId); if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); @@ -448,7 +462,9 @@ EXTERN int __tgt_print_device_info(int64_t DeviceId) { } EXTERN void __tgt_target_nowait_query(void 
**AsyncHandle) { + assert(PM && "Runtime not initialized"); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); + if (!AsyncHandle || !*AsyncHandle) { FATAL_MESSAGE0( 1, "Receive an invalid async handle from the current OpenMP task. Is " diff --git a/openmp/libomptarget/test/offloading/runtime_init.c b/openmp/libomptarget/test/offloading/runtime_init.c new file mode 100644 index 0000000000000..96fd50f51da1e --- /dev/null +++ b/openmp/libomptarget/test/offloading/runtime_init.c @@ -0,0 +1,30 @@ +// RUN: %libomptarget-compile-generic +// RUN: env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 \ +// RUN: %fcheck-generic + +// REQUIRES: libomptarget-debug + +#include +#include + +extern void __tgt_rtl_init(void); +extern void __tgt_rtl_deinit(void); + +// Sanity checks to make sure that this works and is thread safe. +int main() { + // CHECK: Init offload library! + // CHECK: Deinit offload library! + __tgt_rtl_init(); +#pragma omp parallel num_threads(8) + { + __tgt_rtl_init(); + __tgt_rtl_deinit(); + } + __tgt_rtl_deinit(); + + __tgt_rtl_init(); + __tgt_rtl_deinit(); + + // CHECK: PASS + printf("PASS\n"); +} From ec24094b56793478909783c1156fd57ce5ec2006 Mon Sep 17 00:00:00 2001 From: Igor Kudrin Date: Fri, 23 Feb 2024 01:05:06 +0700 Subject: [PATCH 248/351] [LTO] Remove Config.UseDefaultPipeline (#82587) This option is not used. It was added in [D122133](https://reviews.llvm.org/D122133), 5856f30b, with the only usage in `ClangLinkerWrapper.cpp`, which was later updated in a1d57fc2, and then finally removed in [D142650](https://reviews.llvm.org/D142650), 6185246f. 
--- llvm/include/llvm/LTO/Config.h | 3 --- llvm/lib/LTO/LTOBackend.cpp | 2 -- 2 files changed, 5 deletions(-) diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h index 6fb55f1cf1686..482b6e55a19d3 100644 --- a/llvm/include/llvm/LTO/Config.h +++ b/llvm/include/llvm/LTO/Config.h @@ -60,9 +60,6 @@ struct Config { bool VerifyEach = false; bool DisableVerify = false; - /// Use the standard optimization pipeline. - bool UseDefaultPipeline = false; - /// Flag to indicate that the optimizer should not assume builtins are present /// on the target. bool Freestanding = false; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 7b3a7590dfa74..6cfe67779b1a7 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -330,8 +330,6 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, report_fatal_error(Twine("unable to parse pass pipeline description '") + Conf.OptPipeline + "': " + toString(std::move(Err))); } - } else if (Conf.UseDefaultPipeline) { - MPM.addPass(PB.buildPerModuleDefaultPipeline(OL)); } else if (IsThinLTO) { MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary)); } else { From 54a6cf15069e7e88125477e0b3ce1ab063c893c6 Mon Sep 17 00:00:00 2001 From: "S. Bharadwaj Yadavalli" Date: Thu, 22 Feb 2024 13:10:58 -0500 Subject: [PATCH 249/351] [DirectX][NFC] Use LLVM Types in DXIL Operation specifications in DXIL.td (#81692) This change uniformly uses LLVM Types in the specification of parameter types and overload types of DXIL operation. Updated (a) parameter types accordingly in the specification of existing DXILOperations and (b) DXILEmitter. 
--- llvm/lib/Target/DirectX/DXIL.td | 80 ++++++++++++----------------- llvm/utils/TableGen/DXILEmitter.cpp | 79 +++++++++++++--------------- 2 files changed, 69 insertions(+), 90 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 52158139a2584..8a3454c89542c 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -35,30 +35,18 @@ def BinaryUintCategory : DXILOpCategory<"Binary uint">; def UnaryFloatCategory : DXILOpCategory<"Unary float">; def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">; -// Following are the scalar types supported by DXIL operations and are synonymous -// to llvm_*_ty defined for readability and ease of use in the context of this file. - -def voidTy : LLVMType; - -// Floating point types -def f16Ty : LLVMType; -def f32Ty : LLVMType; -def f64Ty : LLVMType; - -// Integer types -def i1Ty : LLVMType; -def i8Ty : LLVMType; -def i16Ty : LLVMType; -def i32Ty : LLVMType; -def i64Ty : LLVMType; +// Represent as any pointer type with an option to change to a qualified pointer +// type with address space specified. 
+def dxil_handle_ty : LLVMAnyPointerType; +def dxil_cbuffer_ty : LLVMAnyPointerType; +def dxil_resource_ty : LLVMAnyPointerType; // The parameter description for a DXIL operation -class DXILOpParameter { int Pos = pos; // Position in parameter list - string Type = type; // LLVM type name, $o for overload, $r for resource - // type, $cb for legacy cbuffer, $u4 for u4 struct + LLVMType ParamType = type; // Parameter type string Name = name; // Short, unique parameter name string Doc = doc; // Description of this parameter bit IsConstant = isConstant; // Whether this parameter requires a constant value in the IR @@ -108,55 +96,55 @@ class DXILOperation { Intrinsic llvm_intrinsic = llvm_intrinsic_; } def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.", - [f16Ty,f32Ty], ReadNone, + [llvm_half_ty, llvm_float_ty], ReadNone, [ - DXILOpParameter<0, "$o", "", "operation result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "$o", "value", "input value"> + DXILOpParameter<0, llvm_anyfloat_ty, "", "operation result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_anyfloat_ty, "value", "input value"> ], ["floats"]>, LLVMIntrinsic; -def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b", - [i16Ty,i32Ty,i64Ty], ReadNone, +def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? 
a : b", + [llvm_i16_ty, llvm_i32_ty, llvm_i64_ty], ReadNone, [ - DXILOpParameter<0, "$o", "", "operation result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "$o", "a", "input value">, - DXILOpParameter<3, "$o", "b", "input value"> + DXILOpParameter<0, llvm_anyint_ty, "", "operation result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_anyint_ty, "a", "input value">, + DXILOpParameter<3, llvm_anyint_ty, "b", "input value"> ], ["uints"]>, LLVMIntrinsic; -def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [i32Ty], ReadNone, +def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "thread ID component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, llvm_i32_ty, "", "thread ID component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic; -def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [i32Ty], ReadNone, +def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "group ID component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read"> + DXILOpParameter<0, llvm_i32_ty, "", "group ID component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read"> ]>, LLVMIntrinsic; -def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, - "reads the thread ID within the group (SV_GroupThreadID)", 
[i32Ty], ReadNone, +def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, + "reads the thread ID within the group (SV_GroupThreadID)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "thread ID in group component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, llvm_i32_ty, "", "thread ID in group component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic; -def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, - "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [i32Ty], ReadNone, +def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, + "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode"> + DXILOpParameter<0, llvm_i32_ty, "", "result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode"> ]>, LLVMIntrinsic; diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 768e8052975b7..d47df597d53a3 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -74,44 +74,32 @@ struct DXILOperationDesc { }; } // end anonymous namespace -// Convert DXIL type name string to dxil::ParameterKind -// -// @param typeNameStr Type name string -// @return ParameterKind as defined in llvm/Support/DXILABI.h -static ParameterKind getDXILTypeNameToKind(StringRef typeNameStr) { - return StringSwitch(typeNameStr) - .Case("voidTy", ParameterKind::VOID) - .Case("f16Ty", ParameterKind::HALF) - .Case("f32Ty", 
ParameterKind::FLOAT) - .Case("f64Ty", ParameterKind::DOUBLE) - .Case("i1Ty", ParameterKind::I1) - .Case("i8Ty", ParameterKind::I8) - .Case("i16Ty", ParameterKind::I16) - .Case("i32Ty", ParameterKind::I32) - .Case("i64Ty", ParameterKind::I64) - .Case("overloadTy", ParameterKind::OVERLOAD) - .Case("handleTy", ParameterKind::DXIL_HANDLE) - .Case("cbufferRetTy", ParameterKind::CBUFFER_RET) - .Case("resourceRetTy", ParameterKind::RESOURCE_RET) - .Default(ParameterKind::INVALID); -} - -static ParameterKind parameterTypeNameToKind(StringRef Name) { - return StringSwitch(Name) - .Case("void", ParameterKind::VOID) - .Case("half", ParameterKind::HALF) - .Case("float", ParameterKind::FLOAT) - .Case("double", ParameterKind::DOUBLE) - .Case("i1", ParameterKind::I1) - .Case("i8", ParameterKind::I8) - .Case("i16", ParameterKind::I16) - .Case("i32", ParameterKind::I32) - .Case("i64", ParameterKind::I64) - .Case("$o", ParameterKind::OVERLOAD) - .Case("dx.types.Handle", ParameterKind::DXIL_HANDLE) - .Case("dx.types.CBufRet", ParameterKind::CBUFFER_RET) - .Case("dx.types.ResRet", ParameterKind::RESOURCE_RET) - .Default(ParameterKind::INVALID); +/*! 
+ Convert DXIL type name string to dxil::ParameterKind + + @param typeNameStr Type name string + @return ParameterKind As defined in llvm/Support/DXILABI.h +*/ +static ParameterKind lookupParameterKind(StringRef typeNameStr) { + auto paramKind = StringSwitch(typeNameStr) + .Case("llvm_void_ty", ParameterKind::VOID) + .Case("llvm_half_ty", ParameterKind::HALF) + .Case("llvm_float_ty", ParameterKind::FLOAT) + .Case("llvm_double_ty", ParameterKind::DOUBLE) + .Case("llvm_i1_ty", ParameterKind::I1) + .Case("llvm_i8_ty", ParameterKind::I8) + .Case("llvm_i16_ty", ParameterKind::I16) + .Case("llvm_i32_ty", ParameterKind::I32) + .Case("llvm_i64_ty", ParameterKind::I64) + .Case("llvm_anyfloat_ty", ParameterKind::OVERLOAD) + .Case("llvm_anyint_ty", ParameterKind::OVERLOAD) + .Case("dxil_handle_ty", ParameterKind::DXIL_HANDLE) + .Case("dxil_cbuffer_ty", ParameterKind::CBUFFER_RET) + .Case("dxil_resource_ty", ParameterKind::RESOURCE_RET) + .Default(ParameterKind::INVALID); + assert(paramKind != ParameterKind::INVALID && + "Unsupported DXIL Type specified"); + return paramKind; } DXILOperationDesc::DXILOperationDesc(const Record *R) { @@ -143,7 +131,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { for (unsigned I = 0; I < OverloadTypeList->size(); ++I) { Record *R = OverloadTypeList->getElementAsRecord(I); - OverloadTypes.emplace_back(getDXILTypeNameToKind(R->getNameInitAsString())); + OverloadTypes.emplace_back(lookupParameterKind(R->getNameInitAsString())); } Attr = StringRef(R->getValue("Attribute")->getNameInitAsString()); } @@ -151,7 +139,8 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { DXILParameter::DXILParameter(const Record *R) { Name = R->getValueAsString("Name"); Pos = R->getValueAsInt("Pos"); - Kind = parameterTypeNameToKind(R->getValueAsString("Type")); + Kind = + lookupParameterKind(R->getValue("ParamType")->getValue()->getAsString()); if (R->getValue("Doc")) Doc = R->getValueAsString("Doc"); IsConst = R->getValueAsBit("IsConstant"); @@ 
-296,10 +285,12 @@ static void emitDXILIntrinsicMap(std::vector &Ops, OS << "\n"; } -// Convert operation attribute string to Attribute enum -// -// @param Attr string reference -// @return std::string Attribute enum string +/*! + Convert operation attribute string to Attribute enum + + @param Attr string reference + @return std::string Attribute enum string + */ static std::string emitDXILOperationAttr(StringRef Attr) { return StringSwitch(Attr) .Case("ReadNone", "Attribute::ReadNone") From 2e7cacfced573283d5424830f20333e2a6731251 Mon Sep 17 00:00:00 2001 From: Emilia Kond Date: Thu, 22 Feb 2024 20:22:05 +0200 Subject: [PATCH 250/351] [clang-format] Fix crash in TokenAnnotator (#82349) The while loop on line 3814 can cause a segmentation fault getting the Next field on a nullptr. This is because further down, on line 3823, there is another for loop, which assigns Tok to Tok->Next in its initializer. This for loop has a condition to check if the result of that isn't null. If it is, the loop is skipped and we drop back out to the outer loop, except, now Tok is null, and we try to dereference it without checking first. This patch adds a defensive check that returns if Tok->Next is null before we make it to the second for loop. 
Fixes https://github.com/llvm/llvm-project/issues/82328 --------- Co-authored-by: Owen Pan --- clang/lib/Format/TokenAnnotator.cpp | 2 +- clang/unittests/Format/FormatTest.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index ec7b7f4dbe347..a60d6ae197a24 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3817,7 +3817,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { do { Tok = Tok->Next; } while (Tok && Tok->isNot(TT_OverloadedOperatorLParen)); - if (!Tok) + if (!Tok || !Tok->MatchingParen) break; const auto *LeftParen = Tok; for (Tok = Tok->Next; Tok && Tok != LeftParen->MatchingParen; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 8282e75bd847f..b8dc01f55b4fa 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -13503,6 +13503,12 @@ TEST_F(FormatTest, IncorrectCodeUnbalancedBraces) { verifyFormat("{"); verifyFormat("#})"); verifyNoCrash("(/**/[:!] ?[)."); + verifyNoCrash("struct X {\n" + " operator iunt(\n" + "};"); + verifyNoCrash("struct Foo {\n" + " operator foo(bar\n" + "};"); } TEST_F(FormatTest, IncorrectUnbalancedBracesInMacrosWithUnicode) { From a23d4ceb8866df91334750627827a1724363e755 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Thu, 22 Feb 2024 10:25:05 -0800 Subject: [PATCH 251/351] [lldb][llvm] Return an error instead of crashing when parsing a line table prologue. (#80769) We recently ran into some bad DWARF where the `DW_AT_stmt_list` of many compile units was randomly set to invalid values and was causing LLDB to crash due to an assertion about address sizes not matching. Instead of asserting, we should return an appropriate recoverable `llvm::Error`. 
--- llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 22 ++++++++++++++++--- .../DebugInfo/DWARF/DWARFDebugLineTest.cpp | 4 +++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index 28f05644a3aa1..572628f45fc23 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -389,9 +389,25 @@ Error DWARFDebugLine::Prologue::parse( if (getVersion() >= 5) { FormParams.AddrSize = DebugLineData.getU8(Cursor); - assert((!Cursor || DebugLineData.getAddressSize() == 0 || - DebugLineData.getAddressSize() == getAddressSize()) && - "Line table header and data extractor disagree"); + const uint8_t DataAddrSize = DebugLineData.getAddressSize(); + const uint8_t PrologueAddrSize = getAddressSize(); + if (Cursor) { + if (DataAddrSize == 0) { + if (PrologueAddrSize != 4 && PrologueAddrSize != 8) { + RecoverableErrorHandler(createStringError( + errc::not_supported, + "parsing line table prologue at offset 0x%8.8" PRIx64 + ": invalid address size %" PRIu8, + PrologueOffset, PrologueAddrSize)); + } + } else if (DataAddrSize != PrologueAddrSize) { + RecoverableErrorHandler(createStringError( + errc::not_supported, + "parsing line table prologue at offset 0x%8.8" PRIx64 ": address " + "size %" PRIu8 " doesn't match architecture address size %" PRIu8, + PrologueOffset, PrologueAddrSize, DataAddrSize)); + } + } SegSelectorSize = DebugLineData.getU8(Cursor); } diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp index d42a626fa9c1c..980b627625eef 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp @@ -823,7 +823,9 @@ TEST_F(DebugLineBasicFixture, ErrorForUnsupportedAddressSizeDefinedInHeader) { nullptr, RecordRecoverable); EXPECT_THAT_ERROR( std::move(Recoverable), - FailedWithMessage("address size 
0x09 of DW_LNE_set_address opcode at " + FailedWithMessage("parsing line table prologue at offset 0x00000000: " + "invalid address size 9", + "address size 0x09 of DW_LNE_set_address opcode at " "offset 0x00000038 is unsupported")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); From da1880cc56060c9da91cbd04daa7f8aa3ea0e829 Mon Sep 17 00:00:00 2001 From: Kevin Frei Date: Thu, 22 Feb 2024 10:26:05 -0800 Subject: [PATCH 252/351] GSym aggregated output to JSON file (#81763) In order to make tooling around dwarf health easier, I've added an `--json-summary-file` option to `llvm-gsymutil` that will spit out error summary data with counts to a JSON file. I've added the same capability to `llvm-dwarfdump` in a [different PR.](https://github.com/llvm/llvm-project/pull/81762) The format of the json is: ```JSON { "error-categories": { "": {"count": 1234}, "": {"count":4321} }, "error-count": 5555 } ``` for a clean run: ```JSON { "error-categories": {}, "error-count": 0 } ``` --------- Co-authored-by: Kevin Frei --- llvm/tools/llvm-gsymutil/Opts.td | 3 +++ llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp | 29 ++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/llvm/tools/llvm-gsymutil/Opts.td b/llvm/tools/llvm-gsymutil/Opts.td index 740291479f932..3aabc8029ccbe 100644 --- a/llvm/tools/llvm-gsymutil/Opts.td +++ b/llvm/tools/llvm-gsymutil/Opts.td @@ -35,3 +35,6 @@ defm address : Eq<"address", "Lookup an address in a GSYM file">; def addresses_from_stdin : FF<"addresses-from-stdin", "Lookup addresses in a GSYM file that are read from stdin\nEach input line is expected to be of the following format: ">; +defm json_summary_file : + Eq<"json-summary-file", + "Output a categorized summary of errors into the JSON file specified.">; diff --git a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp index 2de9c76fd68c0..00a24cdb33fe1 100644 --- 
a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp +++ b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp @@ -18,6 +18,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" +#include "llvm/Support/JSON.h" #include "llvm/Support/LLVMDriver.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" @@ -87,6 +88,7 @@ static std::vector InputFilenames; static std::string ConvertFilename; static std::vector ArchFilters; static std::string OutputFilename; +static std::string JsonSummaryFile; static bool Verify; static unsigned NumThreads; static uint64_t SegmentSize; @@ -138,6 +140,9 @@ static void parseArgs(int argc, char **argv) { if (const llvm::opt::Arg *A = Args.getLastArg(OPT_out_file_EQ)) OutputFilename = A->getValue(); + if (const llvm::opt::Arg *A = Args.getLastArg(OPT_json_summary_file_EQ)) + JsonSummaryFile = A->getValue(); + Verify = Args.hasArg(OPT_verify); if (const llvm::opt::Arg *A = Args.getLastArg(OPT_num_threads_EQ)) { @@ -515,10 +520,34 @@ int llvm_gsymutil_main(int argc, char **argv, const llvm::ToolContext &) { // Call error() if we have an error and it will exit with a status of 1 if (auto Err = convertFileToGSYM(Aggregation)) error("DWARF conversion failed: ", std::move(Err)); + // Report the errors from aggregator: Aggregation.EnumerateResults([&](StringRef category, unsigned count) { OS << category << " occurred " << count << " time(s)\n"; }); + if (!JsonSummaryFile.empty()) { + std::error_code EC; + raw_fd_ostream JsonStream(JsonSummaryFile, EC, sys::fs::OF_Text); + if (EC) { + OS << "error opening aggregate error json file '" << JsonSummaryFile + << "' for writing: " << EC.message() << '\n'; + return 1; + } + + llvm::json::Object Categories; + uint64_t ErrorCount = 0; + Aggregation.EnumerateResults([&](StringRef Category, unsigned Count) { + llvm::json::Object Val; + Val.try_emplace("count", Count); + Categories.try_emplace(Category, std::move(Val)); + ErrorCount += Count; + }); 
+ llvm::json::Object RootNode; + RootNode.try_emplace("error-categories", std::move(Categories)); + RootNode.try_emplace("error-count", ErrorCount); + + JsonStream << llvm::json::Value(std::move(RootNode)); + } return 0; } From 5c24c316496e221e1841418f0f39ccb7200c83c6 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Thu, 22 Feb 2024 22:30:31 +0400 Subject: [PATCH 253/351] [clang] Implement CWG2759 "`[[no_unique_address]` and common initial sequence" (#82607) This patch implements said defect report resolution by adding additional check to common initial sequence evaluation. Consequently, this fixes CWG2759. --- clang/docs/ReleaseNotes.rst | 6 +- clang/lib/Sema/SemaChecking.cpp | 3 + clang/test/CXX/drs/dr27xx.cpp | 97 +++++++++++++++++-- .../SemaCXX/cxx2a-ms-no-unique-address.cpp | 25 +++++ clang/test/SemaCXX/type-traits.cpp | 10 +- clang/www/cxx_dr_status.html | 2 +- 6 files changed, 128 insertions(+), 15 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d8f8a2cb38442..74bb9a07f0b13 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -83,8 +83,6 @@ C++20 Feature Support - Implemented the `__is_layout_compatible` intrinsic to support `P0466R5: Layout-compatibility and Pointer-interconvertibility Traits `_. - Note: `CWG2759: [[no_unique_address] and common initial sequence `_ - is not yet implemented. C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -108,6 +106,10 @@ Resolutions to C++ Defect Reports of two types. (`CWG1719: Layout compatibility and cv-qualification revisited `_). +- ``[[no_unique_address]]`` is now respected when evaluating layout + compatibility of two types. + (`CWG2759: [[no_unique_address] and common initial sequence `_). 
+ C Language Changes ------------------ diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 710437b354521..7fa295ebd9404 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -19036,6 +19036,9 @@ static bool isLayoutCompatible(ASTContext &C, FieldDecl *Field1, return false; } + if (Field1->hasAttr() || + Field2->hasAttr()) + return false; return true; } diff --git a/clang/test/CXX/drs/dr27xx.cpp b/clang/test/CXX/drs/dr27xx.cpp index dd3fd5a20163f..c956c4355abd3 100644 --- a/clang/test/CXX/drs/dr27xx.cpp +++ b/clang/test/CXX/drs/dr27xx.cpp @@ -1,15 +1,98 @@ -// RUN: %clang_cc1 -std=c++98 -verify=expected %s -// RUN: %clang_cc1 -std=c++11 -verify=expected %s -// RUN: %clang_cc1 -std=c++14 -verify=expected %s -// RUN: %clang_cc1 -std=c++17 -verify=expected %s -// RUN: %clang_cc1 -std=c++20 -verify=expected %s -// RUN: %clang_cc1 -std=c++23 -verify=expected,since-cxx23 %s -// RUN: %clang_cc1 -std=c++2c -verify=expected,since-cxx23,since-cxx26 %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++98 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++14 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++17 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++20 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -verify=expected,since-cxx23 %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++2c -verify=expected,since-cxx23,since-cxx26 %s #if __cplusplus <= 202002L // expected-no-diagnostics #endif +namespace dr2759 { // dr2759: 19 +#if __cplusplus >= 201103L + +struct CStruct { + int one; + int two; +}; + +struct CEmptyStruct {}; +struct CEmptyStruct2 {}; + +struct CStructNoUniqueAddress { + int one; + [[no_unique_address]] int two; +}; + +struct CStructNoUniqueAddress2 { + int one; + [[no_unique_address]] int two; +}; + +union UnionLayout 
{ + int a; + double b; + CStruct c; + [[no_unique_address]] CEmptyStruct d; + [[no_unique_address]] CEmptyStruct2 e; +}; + +union UnionLayout2 { + CStruct c; + int a; + CEmptyStruct2 e; + double b; + [[no_unique_address]] CEmptyStruct d; +}; + +union UnionLayout3 { + CStruct c; + int a; + double b; + [[no_unique_address]] CEmptyStruct d; +}; + +struct StructWithAnonUnion { + union { + int a; + double b; + CStruct c; + [[no_unique_address]] CEmptyStruct d; + [[no_unique_address]] CEmptyStruct2 e; + }; +}; + +struct StructWithAnonUnion2 { + union { + CStruct c; + int a; + CEmptyStruct2 e; + double b; + [[no_unique_address]] CEmptyStruct d; + }; +}; + +struct StructWithAnonUnion3 { + union { + CStruct c; + int a; + CEmptyStruct2 e; + double b; + [[no_unique_address]] CEmptyStruct d; + } u; +}; + +static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) != bool(__has_cpp_attribute(no_unique_address)), ""); +static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) != bool(__has_cpp_attribute(no_unique_address)), ""); +static_assert(!__is_layout_compatible(UnionLayout, UnionLayout2), ""); +static_assert(!__is_layout_compatible(UnionLayout, UnionLayout3), ""); +static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion2), ""); +static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3), ""); +#endif +} // namespace dr2759 + namespace dr2789 { // dr2789: 18 #if __cplusplus >= 202302L template diff --git a/clang/test/SemaCXX/cxx2a-ms-no-unique-address.cpp b/clang/test/SemaCXX/cxx2a-ms-no-unique-address.cpp index 42058559a087a..822ed752fa9c7 100644 --- a/clang/test/SemaCXX/cxx2a-ms-no-unique-address.cpp +++ b/clang/test/SemaCXX/cxx2a-ms-no-unique-address.cpp @@ -17,3 +17,28 @@ struct [[msvc::no_unique_address]] S { // expected-error {{only applies to non-b int [[msvc::no_unique_address]] c; // expected-error {{cannot be applied to types}} unsupported-error {{cannot be applied to types}} }; + 
+struct CStructNoUniqueAddress { + int one; + [[no_unique_address]] int two; + // expected-warning@-1 {{unknown attribute 'no_unique_address' ignored}} +}; + +struct CStructMSVCNoUniqueAddress { + int one; + [[msvc::no_unique_address]] int two; + // unsupported-warning@-1 {{unknown attribute 'no_unique_address' ignored}} +}; + +struct CStructMSVCNoUniqueAddress2 { + int one; + [[msvc::no_unique_address]] int two; + // unsupported-warning@-1 {{unknown attribute 'no_unique_address' ignored}} +}; + +static_assert(__has_cpp_attribute(no_unique_address) == 0); +// unsupported-error@-1 {{static assertion failed due to requirement '201803L == 0'}} +static_assert(!__is_layout_compatible(CStructNoUniqueAddress, CStructMSVCNoUniqueAddress), ""); +static_assert(__is_layout_compatible(CStructMSVCNoUniqueAddress, CStructMSVCNoUniqueAddress), ""); +static_assert(!__is_layout_compatible(CStructMSVCNoUniqueAddress, CStructMSVCNoUniqueAddress2), ""); +// unsupported-error@-1 {{static assertion failed due to requirement '!__is_layout_compatible(CStructMSVCNoUniqueAddress, CStructMSVCNoUniqueAddress2)':}} diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index 2c35d5ee19a4c..23c339ebdf082 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -1768,8 +1768,8 @@ void is_layout_compatible(int n) static_assert(!__is_layout_compatible(CppStructNonStandardBySameBase, CppStructNonStandardBySameBase2), ""); static_assert(!__is_layout_compatible(CppStructNonStandardBy2ndVirtBase, CppStructNonStandardBy2ndVirtBase2), ""); static_assert(__is_layout_compatible(CStruct, CStructWithQualifiers), ""); - static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) == bool(__has_cpp_attribute(no_unique_address)), ""); // FIXME: this is CWG2759 - static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) == bool(__has_cpp_attribute(no_unique_address)), ""); // FIXME: this is CWG2759 + 
static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) != bool(__has_cpp_attribute(no_unique_address)), ""); + static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) != bool(__has_cpp_attribute(no_unique_address)), ""); static_assert(__is_layout_compatible(CStruct, CStructAlignment), ""); static_assert(__is_layout_compatible(CStruct, CStructAlignedMembers), ""); // FIXME: alignment of members impact common initial sequence static_assert(__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds), ""); @@ -1782,10 +1782,10 @@ void is_layout_compatible(int n) static_assert(!__is_layout_compatible(void(CStruct2::*)(int), void(CStruct2::*)(char)), ""); static_assert(__is_layout_compatible(CStructNested, CStructNested2), ""); static_assert(__is_layout_compatible(UnionLayout, UnionLayout), ""); - static_assert(__is_layout_compatible(UnionLayout, UnionLayout2), ""); + static_assert(!__is_layout_compatible(UnionLayout, UnionLayout2), ""); static_assert(!__is_layout_compatible(UnionLayout, UnionLayout3), ""); - static_assert(__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion2), ""); - static_assert(__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3), ""); + static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion2), ""); + static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3), ""); static_assert(__is_layout_compatible(EnumLayout, EnumClassLayout), ""); static_assert(__is_layout_compatible(EnumForward, EnumForward), ""); static_assert(__is_layout_compatible(EnumForward, EnumClassForward), ""); diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 38e2cb6314266..8b638e06f4aab 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -16362,7 +16362,7 @@

C++ defect report implementation status

2759 DR [[no_unique_address] and common initial sequence - Unknown + Clang 19 2760 From cc839275164a7768451531af868fa70eb9e71cbd Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 02:42:49 +0800 Subject: [PATCH 254/351] [CVP] Canonicalize signed minmax into unsigned (#82478) This patch turns signed minmax to unsigned to match the behavior for signed icmps. Alive2: https://alive2.llvm.org/ce/z/UAAM42 --- .../Scalar/CorrelatedValuePropagation.cpp | 25 +++++-- .../CorrelatedValuePropagation/min-max.ll | 73 ++++++++++++++++++- 2 files changed, 86 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index c71870bc1b656..6ce9eb3656c93 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -47,11 +47,6 @@ using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" -static cl::opt CanonicalizeICmpPredicatesToUnsigned( - "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden, - cl::desc("Enables canonicalization of signed relational predicates to " - "unsigned (e.g. 
sgt => ugt)")); - STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value"); STATISTIC(NumSelects, "Number of selects propagated"); @@ -90,6 +85,8 @@ STATISTIC(NumSaturating, "Number of saturating arithmetics converted to normal arithmetics"); STATISTIC(NumNonNull, "Number of function pointer arguments marked non-null"); STATISTIC(NumMinMax, "Number of llvm.[us]{min,max} intrinsics removed"); +STATISTIC(NumSMinMax, + "Number of llvm.s{min,max} intrinsics simplified to unsigned"); STATISTIC(NumUDivURemsNarrowedExpanded, "Number of bound udiv's/urem's expanded"); STATISTIC(NumZExt, "Number of non-negative deductions"); @@ -289,9 +286,6 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT, } static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { - if (!CanonicalizeICmpPredicatesToUnsigned) - return false; - // Only for signed relational comparisons of scalar integers. if (Cmp->getType()->isVectorTy() || !Cmp->getOperand(0)->getType()->isIntegerTy()) @@ -528,6 +522,7 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) { } // See if this min/max intrinsic always picks it's one specific operand. +// If not, check whether we can canonicalize signed minmax into unsigned version static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) { CmpInst::Predicate Pred = CmpInst::getNonStrictPredicate(MM->getPredicate()); ConstantRange LHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(0), @@ -546,6 +541,20 @@ static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) { MM->eraseFromParent(); return true; } + + if (MM->isSigned() && + ConstantRange::areInsensitiveToSignednessOfICmpPredicate(LHS_CR, + RHS_CR)) { + ++NumSMinMax; + IRBuilder<> B(MM); + MM->replaceAllUsesWith(B.CreateBinaryIntrinsic( + MM->getIntrinsicID() == Intrinsic::smin ? 
Intrinsic::umin + : Intrinsic::umax, + MM->getLHS(), MM->getRHS())); + MM->eraseFromParent(); + return true; + } + return false; } diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll index d21b8f2418c2e..c9ee233b5a461 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll @@ -176,8 +176,8 @@ define i8 @test15(i8 %x) { ; CHECK-LABEL: @test15( ; CHECK-NEXT: [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 41 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 42) -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X]], i8 42) +; CHECK-NEXT: ret i8 [[TMP1]] ; %lim = icmp sge i8 %x, 41 call void @llvm.assume(i1 %lim) @@ -189,8 +189,8 @@ define i8 @test16(i8 %x) { ; CHECK-LABEL: @test16( ; CHECK-NEXT: [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 41 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 42) -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 42) +; CHECK-NEXT: ret i8 [[TMP1]] ; %lim = icmp sge i8 %x, 41 call void @llvm.assume(i1 %lim) @@ -290,3 +290,68 @@ if.end: %phi = phi i64 [%val, %bb1], [0, %entry] ret i64 %phi } + +define i8 @test_smax_to_umax_nneg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_smax_to_umax_nneg( +; CHECK-NEXT: [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127 +; CHECK-NEXT: [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[NNEG_A]], i8 [[NNEG_B]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %nneg_a = and i8 %a, 127 + %nneg_b = and i8 %b, 127 + %ret = call i8 @llvm.smax.i8(i8 %nneg_a, i8 %nneg_b) + ret i8 %ret +} + +define i8 @test_smax_to_umax_neg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_smax_to_umax_neg( +; CHECK-NEXT: [[NEG_A:%.*]] = or i8 [[A:%.*]], -128 +; CHECK-NEXT: [[NEG_B:%.*]] = or 
i8 [[B:%.*]], -128 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[NEG_A]], i8 [[NEG_B]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %neg_a = or i8 %a, 128 + %neg_b = or i8 %b, 128 + %ret = call i8 @llvm.smax.i8(i8 %neg_a, i8 %neg_b) + ret i8 %ret +} + +define i8 @test_smin_to_umin_nneg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_smin_to_umin_nneg( +; CHECK-NEXT: [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127 +; CHECK-NEXT: [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[NNEG_A]], i8 [[NNEG_B]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %nneg_a = and i8 %a, 127 + %nneg_b = and i8 %b, 127 + %ret = call i8 @llvm.smin.i8(i8 %nneg_a, i8 %nneg_b) + ret i8 %ret +} + +define i8 @test_smin_to_umin_neg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_smin_to_umin_neg( +; CHECK-NEXT: [[NEG_A:%.*]] = or i8 [[A:%.*]], -128 +; CHECK-NEXT: [[NEG_B:%.*]] = or i8 [[B:%.*]], -128 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[NEG_A]], i8 [[NEG_B]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %neg_a = or i8 %a, 128 + %neg_b = or i8 %b, 128 + %ret = call i8 @llvm.smin.i8(i8 %neg_a, i8 %neg_b) + ret i8 %ret +} + +define i8 @test_umax_nneg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_umax_nneg( +; CHECK-NEXT: [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127 +; CHECK-NEXT: [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127 +; CHECK-NEXT: [[RET:%.*]] = call i8 @llvm.umax.i8(i8 [[NNEG_A]], i8 [[NNEG_B]]) +; CHECK-NEXT: ret i8 [[RET]] +; + %nneg_a = and i8 %a, 127 + %nneg_b = and i8 %b, 127 + %ret = call i8 @llvm.umax.i8(i8 %nneg_a, i8 %nneg_b) + ret i8 %ret +} From 33a6ce18373ffd1457ebd54e930b6f02fe4c39c1 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Thu, 22 Feb 2024 13:51:31 -0500 Subject: [PATCH 255/351] [HIP] Allow partial linking for `-fgpu-rdc` (#81700) `-fgpu-rdc` mode allows device functions call device functions in different TU. However, currently all device objects have to be linked together since only one fat binary is supported. 
This is time consuming for AMDGPU backend since it only supports LTO. There are use cases that objects can be divided into groups in which device functions are self-contained but host functions are not. It is desirable to link/optimize/codegen the device code and generate a fatbin for each group, whereas partially link the host code with `ld -r` or generate a static library by using the `--emit-static-lib` option of clang. This avoids linking all device code together, therefore decreases the linking time for `-fgpu-rdc`. Previously, clang emits an external symbol `__hip_fatbin` for all objects for `-fgpu-rdc`. With this patch, clang emits a unique external symbol `__hip_fatbin_{cuid}` for the fat binary for each object. When a group of objects are linked together to generate a fatbin, the symbols are merged by alias and point to the same fat binary. Each group has its own fat binary. One executable or shared library can have multiple fat binaries. Device linking is done for undefined fat binary symbols only to avoid repeated linking. `__hip_gpubin_handle` is also uniquified and merged to avoid repeated registering. Symbol `__hip_cuid_{cuid}` is introduced to facilitate debugging and tooling. 
Fixes: https://github.com/llvm/llvm-project/issues/77018 --- clang/lib/CodeGen/CGCUDANV.cpp | 22 +- clang/lib/CodeGen/CodeGenModule.cpp | 10 +- clang/lib/Driver/OffloadBundler.cpp | 40 ++- clang/lib/Driver/ToolChains/HIPUtility.cpp | 258 +++++++++++++++++- clang/test/CMakeLists.txt | 1 + clang/test/CodeGenCUDA/device-stub.cu | 10 +- .../test/CodeGenCUDA/host-used-device-var.cu | 5 +- clang/test/Driver/Inputs/hip.h | 25 ++ clang/test/Driver/clang-offload-bundler.c | 13 +- clang/test/Driver/hip-partial-link.hip | 97 +++++++ clang/test/Driver/hip-toolchain-rdc.hip | 38 ++- 11 files changed, 469 insertions(+), 50 deletions(-) create mode 100644 clang/test/Driver/Inputs/hip.h create mode 100644 clang/test/Driver/hip-partial-link.hip diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 5b43272bfa62f..49f93451db7bb 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -760,10 +760,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // to contain the fat binary but will be populated somewhere else, // e.g. by lld through link script. FatBinStr = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, - /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, - "__hip_fatbin", nullptr, - llvm::GlobalVariable::NotThreadLocal); + CGM.getModule(), CGM.Int8Ty, + /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, + "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr, + llvm::GlobalVariable::NotThreadLocal); cast(FatBinStr)->setSection(FatbinConstantName); } @@ -816,8 +816,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // thread safety of the loaded program. Therefore we can assume sequential // execution of constructor functions here. if (IsHIP) { - auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : - llvm::GlobalValue::LinkOnceAnyLinkage; + auto Linkage = CudaGpuBinary ? 
llvm::GlobalValue::InternalLinkage + : llvm::GlobalValue::ExternalLinkage; llvm::BasicBlock *IfBlock = llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); llvm::BasicBlock *ExitBlock = @@ -826,11 +826,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // of HIP ABI. GpuBinaryHandle = new llvm::GlobalVariable( TheModule, PtrTy, /*isConstant=*/false, Linkage, - /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy), - "__hip_gpubin_handle"); - if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) - GpuBinaryHandle->setComdat( - CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); + /*Initializer=*/ + CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr, + CudaGpuBinary + ? "__hip_gpubin_handle" + : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash()); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); // Prevent the weak symbol in different shared libraries being merged. if (Linkage != llvm::GlobalValue::InternalLinkage) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 77fb3a62b356e..95e457bef28ed 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -915,7 +915,15 @@ void CodeGenModule::Release() { llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external"); addCompilerUsedGlobal(GV); } - + if (LangOpts.HIP) { + // Emit a unique ID so that host and device binaries from the same + // compilation unit can be associated. 
+ auto *GV = new llvm::GlobalVariable( + getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage, + llvm::Constant::getNullValue(Int8Ty), + "__hip_cuid_" + getContext().getCUIDHash()); + addCompilerUsedGlobal(GV); + } emitLLVMUsed(); if (SanStats) SanStats->finish(); diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index b1091aca5616f..99a34d25cfcd5 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -588,8 +588,15 @@ class ObjectFileHandler final : public FileHandler { StringRef Content = *ContentOrErr; // Copy fat object contents to the output when extracting host bundle. - if (Content.size() == 1u && Content.front() == 0) - Content = StringRef(Input.getBufferStart(), Input.getBufferSize()); + std::string ModifiedContent; + if (Content.size() == 1u && Content.front() == 0) { + auto HostBundleOrErr = getHostBundle(); + if (!HostBundleOrErr) + return HostBundleOrErr.takeError(); + + ModifiedContent = std::move(*HostBundleOrErr); + Content = ModifiedContent; + } OS.write(Content.data(), Content.size()); return Error::success(); @@ -692,6 +699,35 @@ class ObjectFileHandler final : public FileHandler { } return Error::success(); } + + Expected getHostBundle() { + TempFileHandlerRAII TempFiles; + + auto ModifiedObjPathOrErr = TempFiles.Create(std::nullopt); + if (!ModifiedObjPathOrErr) + return ModifiedObjPathOrErr.takeError(); + StringRef ModifiedObjPath = *ModifiedObjPathOrErr; + + BumpPtrAllocator Alloc; + StringSaver SS{Alloc}; + SmallVector ObjcopyArgs{"llvm-objcopy"}; + + ObjcopyArgs.push_back("--regex"); + ObjcopyArgs.push_back("--remove-section=__CLANG_OFFLOAD_BUNDLE__.*"); + ObjcopyArgs.push_back("--"); + ObjcopyArgs.push_back(BundlerConfig.InputFileNames.front()); + ObjcopyArgs.push_back(ModifiedObjPath); + + if (Error Err = executeObjcopy(BundlerConfig.ObjcopyPath, ObjcopyArgs)) + return std::move(Err); + + auto BufOrErr = MemoryBuffer::getFile(ModifiedObjPath); + if 
(!BufOrErr) + return createStringError(BufOrErr.getError(), + "Failed to read back the modified object file"); + + return BufOrErr->get()->getBuffer().str(); + } }; /// Handler for text files. The bundled file will have the following format. diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index f692458b775de..fcecf2e1313bb 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -9,13 +9,24 @@ #include "HIPUtility.h" #include "CommonArgs.h" #include "clang/Driver/Compilation.h" +#include "clang/Driver/Options.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Object/Archive.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" +#include +#include +using namespace clang; using namespace clang::driver; using namespace clang::driver::tools; using namespace llvm::opt; +using llvm::dyn_cast; #if defined(_WIN32) || defined(_WIN64) #define NULL_FILE "nul" @@ -36,6 +47,169 @@ static std::string normalizeForBundler(const llvm::Triple &T, : T.normalize(); } +// Collect undefined __hip_fatbin* and __hip_gpubin_handle* symbols from all +// input object or archive files. 
+class HIPUndefinedFatBinSymbols { +public: + HIPUndefinedFatBinSymbols(const Compilation &C) + : C(C), DiagID(C.getDriver().getDiags().getCustomDiagID( + DiagnosticsEngine::Error, + "Error collecting HIP undefined fatbin symbols: %0")), + Quiet(C.getArgs().hasArg(options::OPT__HASH_HASH_HASH)), + Verbose(C.getArgs().hasArg(options::OPT_v)) { + populateSymbols(); + if (Verbose) { + for (auto Name : FatBinSymbols) + llvm::errs() << "Found undefined HIP fatbin symbol: " << Name << "\n"; + for (auto Name : GPUBinHandleSymbols) + llvm::errs() << "Found undefined HIP gpubin handle symbol: " << Name + << "\n"; + } + } + + const std::set &getFatBinSymbols() const { + return FatBinSymbols; + } + + const std::set &getGPUBinHandleSymbols() const { + return GPUBinHandleSymbols; + } + +private: + const Compilation &C; + unsigned DiagID; + bool Quiet; + bool Verbose; + std::set FatBinSymbols; + std::set GPUBinHandleSymbols; + std::set DefinedFatBinSymbols; + std::set DefinedGPUBinHandleSymbols; + const std::string FatBinPrefix = "__hip_fatbin"; + const std::string GPUBinHandlePrefix = "__hip_gpubin_handle"; + + void populateSymbols() { + std::deque WorkList; + std::set Visited; + + for (const auto &Action : C.getActions()) + WorkList.push_back(Action); + + while (!WorkList.empty()) { + const Action *CurrentAction = WorkList.front(); + WorkList.pop_front(); + + if (!CurrentAction || !Visited.insert(CurrentAction).second) + continue; + + if (const auto *IA = dyn_cast(CurrentAction)) { + std::string ID = IA->getId().str(); + if (!ID.empty()) { + ID = llvm::utohexstr(llvm::MD5Hash(ID), /*LowerCase=*/true); + FatBinSymbols.insert(Twine(FatBinPrefix + "_" + ID).str()); + GPUBinHandleSymbols.insert( + Twine(GPUBinHandlePrefix + "_" + ID).str()); + continue; + } + if (IA->getInputArg().getNumValues() == 0) + continue; + const char *Filename = IA->getInputArg().getValue(); + if (!Filename) + continue; + auto BufferOrErr = llvm::MemoryBuffer::getFile(Filename); + // Input action could be 
options to linker, therefore, ignore it + // if cannot read it. If it turns out to be a file that cannot be read, + // the error will be caught by the linker. + if (!BufferOrErr) + continue; + + processInput(BufferOrErr.get()->getMemBufferRef()); + } else + WorkList.insert(WorkList.end(), CurrentAction->getInputs().begin(), + CurrentAction->getInputs().end()); + } + } + + void processInput(const llvm::MemoryBufferRef &Buffer) { + // Try processing as object file first. + auto ObjFileOrErr = llvm::object::ObjectFile::createObjectFile(Buffer); + if (ObjFileOrErr) { + processSymbols(**ObjFileOrErr); + return; + } + + // Then try processing as archive files. + llvm::consumeError(ObjFileOrErr.takeError()); + auto ArchiveOrErr = llvm::object::Archive::create(Buffer); + if (ArchiveOrErr) { + llvm::Error Err = llvm::Error::success(); + llvm::object::Archive &Archive = *ArchiveOrErr.get(); + for (auto &Child : Archive.children(Err)) { + auto ChildBufOrErr = Child.getMemoryBufferRef(); + if (ChildBufOrErr) + processInput(*ChildBufOrErr); + else + errorHandler(ChildBufOrErr.takeError()); + } + + if (Err) + errorHandler(std::move(Err)); + return; + } + + // Ignore other files. 
+ llvm::consumeError(ArchiveOrErr.takeError()); + } + + void processSymbols(const llvm::object::ObjectFile &Obj) { + for (const auto &Symbol : Obj.symbols()) { + auto FlagOrErr = Symbol.getFlags(); + if (!FlagOrErr) { + errorHandler(FlagOrErr.takeError()); + continue; + } + + auto NameOrErr = Symbol.getName(); + if (!NameOrErr) { + errorHandler(NameOrErr.takeError()); + continue; + } + llvm::StringRef Name = *NameOrErr; + + bool isUndefined = + FlagOrErr.get() & llvm::object::SymbolRef::SF_Undefined; + bool isFatBinSymbol = Name.starts_with(FatBinPrefix); + bool isGPUBinHandleSymbol = Name.starts_with(GPUBinHandlePrefix); + + // Handling for defined symbols + if (!isUndefined) { + if (isFatBinSymbol) { + DefinedFatBinSymbols.insert(Name.str()); + FatBinSymbols.erase(Name.str()); + } else if (isGPUBinHandleSymbol) { + DefinedGPUBinHandleSymbols.insert(Name.str()); + GPUBinHandleSymbols.erase(Name.str()); + } + continue; + } + + // Add undefined symbols if they are not in the defined sets + if (isFatBinSymbol && + DefinedFatBinSymbols.find(Name.str()) == DefinedFatBinSymbols.end()) + FatBinSymbols.insert(Name.str()); + else if (isGPUBinHandleSymbol && + DefinedGPUBinHandleSymbols.find(Name.str()) == + DefinedGPUBinHandleSymbols.end()) + GPUBinHandleSymbols.insert(Name.str()); + } + } + + void errorHandler(llvm::Error Err) { + if (Quiet) + return; + C.getDriver().Diag(DiagID) << llvm::toString(std::move(Err)); + } +}; + // Construct a clang-offload-bundler command to bundle code objects for // different devices into a HIP fat binary. 
void HIP::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, @@ -130,26 +304,84 @@ void HIP::constructGenerateObjFileFromHIPFatBinary( auto HostTriple = C.getSingleOffloadToolChain()->getTriple(); + HIPUndefinedFatBinSymbols Symbols(C); + + std::string PrimaryHipFatbinSymbol; + std::string PrimaryGpuBinHandleSymbol; + bool FoundPrimaryHipFatbinSymbol = false; + bool FoundPrimaryGpuBinHandleSymbol = false; + + std::vector AliasHipFatbinSymbols; + std::vector AliasGpuBinHandleSymbols; + + // Iterate through symbols to find the primary ones and collect others for + // aliasing + for (const auto &Symbol : Symbols.getFatBinSymbols()) { + if (!FoundPrimaryHipFatbinSymbol) { + PrimaryHipFatbinSymbol = Symbol; + FoundPrimaryHipFatbinSymbol = true; + } else + AliasHipFatbinSymbols.push_back(Symbol); + } + + for (const auto &Symbol : Symbols.getGPUBinHandleSymbols()) { + if (!FoundPrimaryGpuBinHandleSymbol) { + PrimaryGpuBinHandleSymbol = Symbol; + FoundPrimaryGpuBinHandleSymbol = true; + } else + AliasGpuBinHandleSymbols.push_back(Symbol); + } + // Add MC directives to embed target binaries. We ensure that each // section and image is 16-byte aligned. This is not mandatory, but // increases the likelihood of data to be aligned with a cache block // in several main host machines. 
ObjStream << "# HIP Object Generator\n"; ObjStream << "# *** Automatically generated by Clang ***\n"; - if (HostTriple.isWindowsMSVCEnvironment()) { - ObjStream << " .section .hip_fatbin, \"dw\"\n"; - } else { - ObjStream << " .protected __hip_fatbin\n"; - ObjStream << " .type __hip_fatbin,@object\n"; - ObjStream << " .section .hip_fatbin,\"a\",@progbits\n"; + if (FoundPrimaryGpuBinHandleSymbol) { + // Define the first gpubin handle symbol + if (HostTriple.isWindowsMSVCEnvironment()) + ObjStream << " .section .hip_gpubin_handle,\"dw\"\n"; + else { + ObjStream << " .protected " << PrimaryGpuBinHandleSymbol << "\n"; + ObjStream << " .type " << PrimaryGpuBinHandleSymbol << ",@object\n"; + ObjStream << " .section .hip_gpubin_handle,\"aw\"\n"; + } + ObjStream << " .globl " << PrimaryGpuBinHandleSymbol << "\n"; + ObjStream << " .p2align 3\n"; // Align 8 + ObjStream << PrimaryGpuBinHandleSymbol << ":\n"; + ObjStream << " .zero 8\n"; // Size 8 + + // Generate alias directives for other gpubin handle symbols + for (const auto &AliasSymbol : AliasGpuBinHandleSymbols) { + ObjStream << " .globl " << AliasSymbol << "\n"; + ObjStream << " .set " << AliasSymbol << "," << PrimaryGpuBinHandleSymbol + << "\n"; + } + } + if (FoundPrimaryHipFatbinSymbol) { + // Define the first fatbin symbol + if (HostTriple.isWindowsMSVCEnvironment()) + ObjStream << " .section .hip_fatbin,\"dw\"\n"; + else { + ObjStream << " .protected " << PrimaryHipFatbinSymbol << "\n"; + ObjStream << " .type " << PrimaryHipFatbinSymbol << ",@object\n"; + ObjStream << " .section .hip_fatbin,\"a\",@progbits\n"; + } + ObjStream << " .globl " << PrimaryHipFatbinSymbol << "\n"; + ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) + << "\n"; + // Generate alias directives for other fatbin symbols + for (const auto &AliasSymbol : AliasHipFatbinSymbols) { + ObjStream << " .globl " << AliasSymbol << "\n"; + ObjStream << " .set " << AliasSymbol << "," << PrimaryHipFatbinSymbol + << "\n"; + } + 
ObjStream << PrimaryHipFatbinSymbol << ":\n"; + ObjStream << " .incbin "; + llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true); + ObjStream << "\n"; } - ObjStream << " .globl __hip_fatbin\n"; - ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) - << "\n"; - ObjStream << "__hip_fatbin:\n"; - ObjStream << " .incbin "; - llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true); - ObjStream << "\n"; if (HostTriple.isOSLinux() && HostTriple.isOSBinFormatELF()) ObjStream << " .section .note.GNU-stack, \"\", @progbits\n"; ObjStream.flush(); diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index 6b5cb0a18457b..fcfca354f4a75 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -136,6 +136,7 @@ if( NOT CLANG_BUILT_STANDALONE ) llvm-strip llvm-symbolizer llvm-windres + obj2yaml opt split-file yaml2obj diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu index d7a7b1bb9fe95..60304647bd4c5 100644 --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -50,21 +50,19 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ +// RUN: %clang_cc1 -cuid=123 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF // RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN -// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ +// RUN: %clang_cc1 -cuid=123 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ // RUN: -o - -x hip\ // RUN: | 
FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN,HIP,HIPNEF #include "Inputs/cuda.h" -// HIPNEF: $__hip_gpubin_handle = comdat any - #ifndef NOGLOBALS // NORDC-DAG: @device_var = internal global i32 // RDC-DAG: @device_var = global i32 @@ -161,7 +159,7 @@ __device__ void device_use() { // * constant unnamed string with GPU binary // CUDA: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.", // HIPEF: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.",{{.*}}align 4096 -// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" +// HIPNEF: @[[FATBIN:__hip_fatbin_[0-9a-f]+]] = external constant i8, section ".hip_fatbin" // CUDANORDC-SAME: section ".nv_fatbin", align 8 // CUDARDC-SAME: section "__nv_relfatbin", align 8 // * constant struct that wraps GPU binary @@ -177,7 +175,7 @@ __device__ void device_use() { // HIP-SAME: section ".hipFatBinSegment" // * variable to save GPU binary handle after initialization // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global ptr null -// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global ptr null +// HIPNEF: @__[[PREFIX]]_gpubin_handle_{{[0-9a-f]+}} = external hidden global ptr, align 8 // * constant unnamed string with NVModuleID // CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 diff --git a/clang/test/CodeGenCUDA/host-used-device-var.cu b/clang/test/CodeGenCUDA/host-used-device-var.cu index 7cb31aff84264..5328660c9dc9d 100644 --- a/clang/test/CodeGenCUDA/host-used-device-var.cu +++ b/clang/test/CodeGenCUDA/host-used-device-var.cu @@ -1,9 +1,9 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ // RUN: -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \ -// RUN: | FileCheck -check-prefix=DEV %s +// RUN: -cuid=123 | FileCheck -check-prefix=DEV %s // RUN: %clang_cc1 -triple 
x86_64-unknown-linux-gnu -x hip %s \ -// RUN: -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST %s +// RUN: -std=c++17 -O3 -emit-llvm -o - -cuid=123 | FileCheck -check-prefix=HOST %s // Negative tests. @@ -187,6 +187,7 @@ public: // DEV-SAME: {{^[^@]*}} @_ZL2u3 // DEV-SAME: {{^[^@]*}} @_ZZ4fun1vE11static_var1 // DEV-SAME: {{^[^@]*}} @_ZZZN21TestStaticVarInLambda3funEvENKUlPcE_clES0_E4var2 +// DEV-SAME: {{^[^@]*}} @__hip_cuid_{{[0-9a-f]+}} // DEV-SAME: {{^[^@]*}} @constexpr_var2b // DEV-SAME: {{^[^@]*}} @inline_var // DEV-SAME: {{^[^@]*}} @u1 diff --git a/clang/test/Driver/Inputs/hip.h b/clang/test/Driver/Inputs/hip.h new file mode 100644 index 0000000000000..5be772a7b3413 --- /dev/null +++ b/clang/test/Driver/Inputs/hip.h @@ -0,0 +1,25 @@ +/* Minimal declarations for HIP support. Testing purposes only. */ + +#define __constant__ __attribute__((constant)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __host__ __attribute__((host)) +#define __shared__ __attribute__((shared)) +#define __managed__ __attribute__((managed)) + +struct dim3 { + unsigned x, y, z; + __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} +}; + +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; +int hipConfigureCall(dim3 gridSize, dim3 blockSize, unsigned long long sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + unsigned long long sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + unsigned long long sharedMem, + hipStream_t stream); diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c index 7d0b6b27a60ae..9d8b81ee9806e 100644 --- a/clang/test/Driver/clang-offload-bundler.c +++ b/clang/test/Driver/clang-offload-bundler.c @@ -10,6 +10,7 @@ // RUN: %clang 
-O0 -target %itanium_abi_triple %s -c -emit-llvm -o %t.bc // RUN: %clang -O0 -target %itanium_abi_triple %s -S -o %t.s // RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.o +// RUN: obj2yaml %t.o > %t.o.yaml // RUN: %clang -O0 -target %itanium_abi_triple %s -emit-ast -o %t.ast // @@ -305,11 +306,13 @@ // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle3.o // RUN: clang-offload-bundler -type=o -input=%t.bundle3.o -list | FileCheck -check-prefix=CKLST %s // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.o -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.bundle3.o -unbundle -// RUN: diff %t.bundle3.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -output=%t.res.tgt1 -output=%t.res.o -output=%t.res.tgt2 -input=%t.bundle3.o -unbundle -// RUN: diff %t.bundle3.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu -output=%t.res.tgt1 -input=%t.bundle3.o -unbundle @@ -318,11 +321,13 @@ // Check if we can unbundle a file with no magic strings. 
// RUN: clang-offload-bundler -type=o -input=%t.o -list | FileCheck -check-prefix=CKLST2 --allow-empty %s // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.o -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.o -unbundle -allow-missing-bundles -// RUN: diff %t.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -output=%t.res.tgt1 -output=%t.res.o -output=%t.res.tgt2 -input=%t.o -unbundle -allow-missing-bundles -// RUN: diff %t.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 diff --git a/clang/test/Driver/hip-partial-link.hip b/clang/test/Driver/hip-partial-link.hip new file mode 100644 index 0000000000000..a1d31f9a65195 --- /dev/null +++ b/clang/test/Driver/hip-partial-link.hip @@ -0,0 +1,97 @@ +// REQUIRES: x86-registered-target, amdgpu-registered-target, lld, system-linux + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.1.o + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -DLIB \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.2.o + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -DMAIN \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.main.o + +// RUN: llvm-nm %t.1.o | FileCheck -check-prefix=OBJ1 %s +// OBJ1: B __hip_cuid_[[ID:[0-9a-f]+]] +// OBJ1: U __hip_fatbin_[[ID]] +// OBJ1: U __hip_gpubin_handle_[[ID]] + +// RUN: 
llvm-nm %t.2.o | FileCheck -check-prefix=OBJ2 %s +// OBJ2: B __hip_cuid_[[ID:[0-9a-f]+]] +// OBJ2: U __hip_fatbin_[[ID]] +// OBJ2: U __hip_gpubin_handle_[[ID]] + +// Link %t.1.o and %t.2.o by -r and then link with %t.main.o + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -r -fuse-ld=lld -nostdlib %t.1.o %t.2.o -o %t.lib.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LD-R %s +// LD-R: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// LD-R: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// LD-R: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// LD-R: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// LD-R: "{{.*}}/clang-offload-bundler" {{.*}}-unbundle +// LD-R: "{{.*}}/lld" -flavor gnu -m elf64_amdgpu +// LD-R: "{{.*}}/clang-offload-bundler" +// LD-R: "{{.*}}/llvm-mc" -triple x86_64-unknown-linux-gnu +// LD-R: "{{.*}}/ld.lld" {{.*}} -r + +// RUN: llvm-nm %t.lib.o | FileCheck -check-prefix=OBJ %s +// OBJ: B __hip_cuid_[[ID1:[0-9a-f]+]] +// OBJ: B __hip_cuid_[[ID2:[0-9a-f]+]] +// OBJ: R __hip_fatbin_[[ID1]] +// OBJ: R __hip_fatbin_[[ID2]] +// OBJ: D __hip_gpubin_handle_[[ID1]] +// OBJ: D __hip_gpubin_handle_[[ID2]] + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -fuse-ld=lld -nostdlib -r %t.main.o %t.lib.o -o %t.final.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LINK-O %s +// LINK-O-NOT: Found undefined HIP {{.*}}symbol + +// Generate a static lib with %t.1.o and %t.2.o then link with %t.main.o + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -fgpu-rdc --offload-arch=gfx906 \ +// RUN: --emit-static-lib -fuse-ld=lld -nostdlib %t.1.o %t.2.o -o %t.a \ +// RUN: 2>&1 | FileCheck -check-prefix=STATIC %s +// STATIC: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// STATIC: Found undefined HIP 
fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// STATIC: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// STATIC: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// STATIC: "{{.*}}/clang-offload-bundler" {{.*}}-unbundle +// STATIC: "{{.*}}/lld" -flavor gnu -m elf64_amdgpu +// STATIC: "{{.*}}/clang-offload-bundler" +// STATIC: "{{.*}}/llvm-mc" -triple x86_64-unknown-linux-gnu +// STATIC: "{{.*}}/llvm-ar" + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -fuse-ld=lld -nostdlib -r %t.main.o %t.a -o %t.final.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LINK-A %s +// LINK-A-NOT: Found undefined HIP {{.*}}symbol + +#include "hip.h" + +#ifdef LIB +__device__ int x; +__device__ void libfun() { + x = 1; +} +#elif !defined(MAIN) +__device__ void libfun(); +__global__ void kern() { + libfun(); +} +void run() { + kern<<<1,1>>>(); +} +#else +extern void run(); +int main() { + run(); +} +#endif diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 1827531f9cab7..d19d8ccd6cb29 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -1,7 +1,7 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target -// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: %clang -### --target=x86_64-linux-gnu -v \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ @@ -12,7 +12,7 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LNX %s -// RUN: %clang -### --target=x86_64-pc-windows-msvc \ +// RUN: %clang -### --target=x86_64-pc-windows-msvc -v \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ // RUN: 
--hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ @@ -23,15 +23,31 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,MSVC %s -// check code object alignment in dumped llvm-mc input -// LNX: .protected __hip_fatbin -// LNX: .type __hip_fatbin,@object -// LNX: .section .hip_fatbin,"a",@progbits -// MSVC: .section .hip_fatbin, "dw" -// CHECK: .globl __hip_fatbin -// CHECK: .p2align 12 -// CHECK: __hip_fatbin: -// CHECK: .incbin "[[BUNDLE:.*hipfb]]" +// check HIP fatbin and gpubin handle symbols and code object alignment in dumped llvm-mc input +// CHECK: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// CHECK: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// CHECK: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// CHECK: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// LNX: .protected __hip_gpubin_handle_[[ID1]] +// LNX: .type __hip_gpubin_handle_[[ID1]] +// LNX-LABEL: .section .hip_gpubin_handle,"aw" +// MSVC-LABEL: .section .hip_gpubin_handle,"dw" +// CHECK: .globl __hip_gpubin_handle_[[ID1]] +// CHECK-NEXT: .p2align 3 +// CHECK-NEXT:__hip_gpubin_handle_[[ID1]]: +// CHECK-NEXT: .zero 8 +// CHECK-NEXT: .globl __hip_gpubin_handle_[[ID2]] +// CHECK-NEXT: .set __hip_gpubin_handle_[[ID2]],__hip_gpubin_handle_[[ID1]] +// LNX: .protected __hip_fatbin_[[ID1]] +// LNX: .type __hip_fatbin_[[ID1]],@object +// LNX-LABEL: .section .hip_fatbin,"a",@progbits +// MSVC-LABEL: .section .hip_fatbin,"dw" +// CHECK: .globl __hip_fatbin_[[ID1]] +// CHECK-NEXT: .p2align 12 +// CHECK-NEXT: .globl __hip_fatbin_[[ID2]] +// CHECK-NEXT: .set __hip_fatbin_[[ID2]],__hip_fatbin_[[ID1]] +// CHECK-NEXT: __hip_fatbin_[[ID1]]: +// CHECK-NEXT: .incbin "[[BUNDLE:.*hipfb]]" // LNX: .section .note.GNU-stack, "", @progbits // MSVC-NOT: .note.GNU-stack From 1069823ce7d154aa8ef87ae5a0fd34b527eca2a0 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov 
<6532716+alexander-shaposhnikov@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:02:47 -0800 Subject: [PATCH 256/351] Enable JumpTableToSwitch pass by default (#82546) Enable JumpTableToSwitch pass by default. Test plan: ninja check-all --- llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/test/Other/new-pm-defaults.ll | 6 +----- llvm/test/Other/new-pm-thinlto-postlink-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll | 1 + .../Other/new-pm-thinlto-postlink-samplepgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll | 1 + .../test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll | 1 + 8 files changed, 8 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 142bd50b3798e..17b55b63ac03c 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -247,7 +247,7 @@ static cl::opt static cl::opt EnableJumpTableToSwitch( "enable-jump-table-to-switch", - cl::desc("Enable JumpTableToSwitch pass (default = off)")); + cl::desc("Enable JumpTableToSwitch pass (default = on)"), cl::init(true)); // This option is used in simplifying testing SampleFDO optimizations for // profile loading. 
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 51fb93daa4dfa..285077ff8e31a 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -71,10 +71,6 @@ ; RUN: -passes='default' -S %s 2>&1 \ ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ -; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ -; RUN: -passes='default' -enable-jump-table-to-switch -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-JUMP-TABLE-TO-SWITCH,CHECK-O23SZ,%llvmcheckext - ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -enable-matrix -S %s 2>&1 \ ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX @@ -155,7 +151,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis -; CHECK-JUMP-TABLE-TO-SWITCH-NEXT: Running pass: JumpTableToSwitchPass +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 064362eabbf83..29a4d79037427 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -90,6 +90,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: 
Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 19a44867e434a..bf06782c86f86 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -78,6 +78,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index ac80a31d8fd4b..0cc61121de01c 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -86,6 +86,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index 6486639e07b49..0e5839797afe9 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -121,6 +121,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: 
Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index 09f9f0f48badd..68c2e58146300 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -118,6 +118,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index 47bdbfd2d357d..8311a009711d1 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -90,6 +90,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass From 4f7ab789bf43b49914815bdf4e4c3703f92e781d Mon Sep 17 00:00:00 2001 From: Boian Petkantchin Date: Thu, 22 Feb 2024 11:06:14 -0800 Subject: [PATCH 257/351] [mlir][mesh] add support in spmdization for incomplete sharding annotations (#82442) Don't require that `mesh.shard` operations come in pairs. 
If there is only a single `mesh.shard` operation we assume that the producer result and consumer operand have the same sharding. --- .../Dialect/Mesh/Transforms/Spmdization.cpp | 45 ++++++++++++------- mlir/test/Dialect/Mesh/spmdization.mlir | 14 ++++++ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp index 7cbe0de048769..c4d8b0b15e462 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp @@ -593,7 +593,6 @@ static SmallVector getOperandShardings(Operation &op) { Operation *definingOp = operand.getDefiningOp(); assert(definingOp); ShardOp shardOp = llvm::cast(definingOp); - assert(shardOp.getAnnotateForUsers()); return shardOp.getShard(); }); return res; @@ -615,34 +614,46 @@ static SmallVector getResultShardings(Operation &op) { assert(result.hasOneUse()); Operation *userOp = *result.getUsers().begin(); ShardOp shardOp = llvm::cast(userOp); - assert(!shardOp.getAnnotateForUsers()); return shardOp.getShard(); }); return res; } static LogicalResult -spmdizeOperation(Operation &op, IRMapping &spmdizationMap, +spmdizeOperation(ShardOp shardOp, IRMapping &spmdizationMap, SymbolTableCollection &symbolTableCollection, OpBuilder &builder) { - ShardOp shardOp = llvm::dyn_cast(op); - if (shardOp) { - if (!shardOp.getAnnotateForUsers()) { - return success(); - } - + Value targetSpmdValue; + + // Check if 2 shard ops are chained. If not there is no need for resharding + // as the source and target shared the same sharding. + ShardOp srcShardOp = + dyn_cast_or_null(shardOp.getOperand().getDefiningOp()); + if (!srcShardOp) { + targetSpmdValue = spmdizationMap.lookup(shardOp.getOperand()); + } else { // Insert resharding. 
- ShardOp srcShardOp = - llvm::cast(shardOp.getOperand().getDefiningOp()); - assert(!srcShardOp.getAnnotateForUsers()); + assert(!srcShardOp.getAnnotateForUsers() && shardOp.getAnnotateForUsers()); TypedValue srcSpmdValue = spmdizationMap.lookup(srcShardOp.getOperand()) .cast>(); - Value targetSpmdValue = reshard(builder, srcShardOp, shardOp, srcSpmdValue, - symbolTableCollection); - assert(!spmdizationMap.contains(shardOp.getResult())); - spmdizationMap.map(shardOp.getResult(), targetSpmdValue); - return success(); + targetSpmdValue = reshard(builder, srcShardOp, shardOp, srcSpmdValue, + symbolTableCollection); + } + + assert(!spmdizationMap.contains(shardOp.getResult())); + spmdizationMap.map(shardOp.getResult(), targetSpmdValue); + return success(); +} + +static LogicalResult +spmdizeOperation(Operation &op, IRMapping &spmdizationMap, + SymbolTableCollection &symbolTableCollection, + OpBuilder &builder) { + ShardOp shardOp = llvm::dyn_cast(op); + if (shardOp) { + return spmdizeOperation(shardOp, spmdizationMap, symbolTableCollection, + builder); } SmallVector spmdizedOperands; diff --git a/mlir/test/Dialect/Mesh/spmdization.mlir b/mlir/test/Dialect/Mesh/spmdization.mlir index 2fb8029dfe64a..572d3eb55eaaa 100644 --- a/mlir/test/Dialect/Mesh/spmdization.mlir +++ b/mlir/test/Dialect/Mesh/spmdization.mlir @@ -127,3 +127,17 @@ func.func @multiple_chained_ops( // CHECK: return %[[RESHARD3]] : tensor<1xi8> return %7 : tensor<2xi8> } + +// CHECK-LABEL: func @incomplete_sharding +func.func @incomplete_sharding( + // CHECK-SAME: %[[ARG:.*]]: tensor<4x16xf32> + %arg0: tensor<8x16xf32> +// CHECK-SAME: -> tensor<4x16xf32> { +) -> tensor<8x16xf32> { + %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> annotate_for_users : tensor<8x16xf32> + // CHECK: %[[RES:.*]] = tosa.sigmoid %[[ARG]] : (tensor<4x16xf32>) -> tensor<4x16xf32> + %1 = tosa.sigmoid %0 : (tensor<8x16xf32>) -> tensor<8x16xf32> + %2 = mesh.shard %1 to <@mesh_1d, [[0]]> : tensor<8x16xf32> + // CHECK: return %[[RES]] : 
tensor<4x16xf32> + return %2 : tensor<8x16xf32> +} From 744c0057e7dc0d1d046a4867cece2f31fee9bb23 Mon Sep 17 00:00:00 2001 From: Nashe Mncube Date: Thu, 22 Feb 2024 19:15:52 +0000 Subject: [PATCH 258/351] [AArch64][CodeGen] Fix crash when fptrunc returns fp16 with +nofp attr (#81724) When performing lowering of the fptrunc opcode returning fp16 with the +nofp flag enabled we could trigger a compiler crash. This is because we had no custom lowering implemented. This patch the case in which we need to promote an fp16 return type for fptrunc when the +nofp attr is enabled. --- .../Target/AArch64/AArch64ISelLowering.cpp | 14 ++- .../16bit-float-promotion-with-nofp.ll | 31 +++++ .../AArch64/strictfp_f16_abi_promote.ll | 115 +++++++++++++++--- 3 files changed, 138 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 184ebc19bc9ed..3b92e95d7c287 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -541,10 +541,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFPARMv8()) + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFPARMv8()) + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); @@ -947,9 +949,11 @@ 
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); - setOperationAction(ISD::BITCAST, MVT::i16, Custom); - setOperationAction(ISD::BITCAST, MVT::f16, Custom); - setOperationAction(ISD::BITCAST, MVT::bf16, Custom); + if (Subtarget->hasFPARMv8()) { + setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::BITCAST, MVT::f16, Custom); + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); + } // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll new file mode 100644 index 0000000000000..bfe9ab8424bb0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 -mattr=-fp-armv8 -o - %s | FileCheck %s + +define half @f2h(float %a) { +; CHECK-LABEL: f2h: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = fptrunc float %a to half + ret half %0 +} + +define bfloat @f2bfloat(float %a) { +; CHECK-LABEL: f2bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl __truncsfbf2 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = fptrunc float %a to bfloat + ret bfloat %0 +} + diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll index a34f7abcc22a3..9fa5208cc8db6 100644 --- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -131,26 +131,107 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ret void } -; FIXME: -; define half @f16_return(float %arg) #0 { -; %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") -; ret half %fptrunc -; } + define half @f16_return(float %arg) #0 { +; NOFP16-LABEL: f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 +; NOFP16-NEXT: .cfi_offset w30, -16 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; NOFP16-NEXT: ret + %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret half %fptrunc + } -; define <2 x half> @v2f16_return(<2 x float> %arg) #0 { -; %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") -; ret <2 x half> %fptrunc -; } + define <2 x half> @v2f16_return(<2 x float> %arg) #0 { +; NOFP16-LABEL: v2f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 32 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w30, -32 +; NOFP16-NEXT: mov w19, w0 +; NOFP16-NEXT: mov w0, w1 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: mov w0, w19 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w1, w20 +; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; NOFP16-NEXT: ret + %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <2 x half> %fptrunc + } -; define <3 x half> @v3f16_return(<3 x float> %arg) #0 { -; %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") -; ret <3 x half> %fptrunc -; } + define <3 x half> @v3f16_return(<3 x float> %arg) #0 { +; NOFP16-LABEL: v3f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 32 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w30, -32 +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: mov w0, w2 +; NOFP16-NEXT: mov w19, w1 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: mov w0, w19 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w19, w0 +; NOFP16-NEXT: mov w0, w20 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w1, w19 +; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: mov w2, w21 +; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <3 x half> %fptrunc + } -; define <4 x half> @v4f16_return(<4 x float> %arg) #0 { -; %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") -; ret <4 x half> %fptrunc -; } + define <4 x half> @v4f16_return(<4 x float> %arg) #0 { +; NOFP16-LABEL: v4f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: mov w0, w3 +; NOFP16-NEXT: mov w19, w2 +; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w22, w0 +; NOFP16-NEXT: mov w0, w19 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w19, w0 +; NOFP16-NEXT: mov w0, w20 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: mov w0, w21 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w1, w20 +; NOFP16-NEXT: mov w2, w19 +; NOFP16-NEXT: mov w3, w22 +; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; NOFP16-NEXT: ret + %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <4 x half> %fptrunc + } ; FIXME: ; define void @outgoing_f16_arg(ptr %ptr) #0 { From 6ddb25ed9ca2cb0f4ad8f402d7411ac3328f598d Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 22 Feb 2024 11:19:02 -0800 Subject: [PATCH 259/351] [scudo] increase frames per stack to 16 for stack depot (#82427) 8 was very low and it is likely that in real workloads we have more than an average of 8 frames per stack given on Android we have 3 at the bottom: __start_main, __libc_init, main, and three at the top: malloc, scudo_malloc and Allocator::allocate. That leaves 2 frames for application code, which is clearly unreasonable. 
--- compiler-rt/lib/scudo/standalone/combined.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index f3c3d757c9f12..f13cf9498a793 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -1522,7 +1522,12 @@ class Allocator { constexpr u32 kStacksPerRingBufferEntry = 2; constexpr u32 kMaxU32Pow2 = ~(UINT32_MAX >> 1); static_assert(isPowerOfTwo(kMaxU32Pow2)); - constexpr u32 kFramesPerStack = 8; + // On Android we always have 3 frames at the bottom: __start_main, + // __libc_init, main, and 3 at the top: malloc, scudo_malloc and + // Allocator::allocate. This leaves 10 frames for the user app. The next + // smallest power of two (8) would only leave 2, which is clearly too + // little. + constexpr u32 kFramesPerStack = 16; static_assert(isPowerOfTwo(kFramesPerStack)); // We need StackDepot to be aligned to 8-bytes so the ring we store after From 242f98c7ab7c100d76cac29b555db20205619b38 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 22 Feb 2024 20:21:09 +0100 Subject: [PATCH 260/351] [Clang][SME] Skip writing output files to the source directory --- clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c index 7eb74f28a1c85..25aebeced9379 100644 --- a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c +++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_NONE %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_COMPATIBLE %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_STREAMING %s -// RUN: %clang_cc1 
-triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_LOCALLY %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_NONE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_COMPATIBLE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_STREAMING %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_LOCALLY %s #define __ai __attribute__((always_inline)) __ai void inlined_fn(void) {} From 3168af56bcb827360c26957ef579b7871dad8e17 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 22 Feb 2024 20:25:58 +0100 Subject: [PATCH 261/351] LoopVectorize: Mark crash test as requiring assertions --- llvm/test/Transforms/LoopVectorize/X86/pr72969.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll index a54bd39f3ff60..40633c6c8383b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -1,5 +1,6 @@ ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s +; REQUIRES: asserts @h = global i64 0 From 32994cc0d63513f77223c64148faeeb50aebb702 Mon Sep 17 00:00:00 2001 From: Alexey Bataev <5361294+alexey-bataev@users.noreply.github.com> Date: Thu, 22 Feb 2024 14:32:15 -0500 Subject: [PATCH 262/351] [SLP]Improve findReusedOrderedScalars and graph rotation. Patch syncs the code in findReusedOrderedScalars with cost estimation/codegen. It tries to use similar logic to better determine best order. 
Before, it just tried to find previously vectorized node without checking if it is possible to use the vectorized value in the shuffle. Now it relies on the more generalized version. If it determines, that a single vector must be reordered (using same mechanism, as codegen and cost estimation), it generates better order. The comparison between new/ref ordering: Metric: SLP.NumVectorInstructions Program SLP.NumVectorInstructions results results0 diff test-suite :: MultiSource/Benchmarks/nbench/nbench.test 139.00 140.00 0.7% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 344.00 346.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test 1293.00 1292.00 -0.1% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 5176.00 5170.00 -0.1% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 5173.00 5167.00 -0.1% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 11692.00 11660.00 -0.3% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 1621.00 1615.00 -0.4% test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test 795.00 792.00 -0.4% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 26499.00 26338.00 -0.6% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 7343.00 7281.00 -0.8% test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 1104.00 1094.00 -0.9% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 2216.00 2180.00 -1.6% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 787.00 637.00 -19.1% Less 0% is better. Most of the benchmarks see more vectorized code. The first ones just have shuffles removed. The ordering analysis still may require some improvements (e.g. for alternate nodes), but this one should be produce better results. 
Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/77529 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 446 ++++++++++++++---- .../AArch64/extractelements-to-shuffle.ll | 16 +- .../AArch64/reorder-fmuladd-crash.ll | 7 +- .../SLPVectorizer/AArch64/tsc-s116.ll | 22 +- .../AArch64/vec3-reorder-reshuffle.ll | 34 +- .../Transforms/SLPVectorizer/X86/pr35497.ll | 16 +- .../SLPVectorizer/X86/reduction-transpose.ll | 16 +- .../X86/reorder-clustered-node.ll | 11 +- .../X86/reorder-reused-masked-gather.ll | 7 +- .../SLPVectorizer/X86/reorder-vf-to-resize.ll | 2 +- .../X86/scatter-vectorize-reorder.ll | 17 +- .../X86/shrink_after_reorder2.ll | 11 +- .../X86/vec3-reorder-reshuffle.ll | 17 +- 13 files changed, 447 insertions(+), 175 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4e334748c9593..de4e56ff80659 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2422,18 +2422,25 @@ class BoUpSLP { /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for /// permutations. Must form single-register vector. + /// \param ForOrder Tries to fetch the best candidates for ordering info. Also + /// commands to build the mask using the original vector value, without + /// relying on the potential reordering. /// \returns ShuffleKind, if gathered values can be represented as shuffles of /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional isGatherShuffledSingleRegisterEntry( const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, - SmallVectorImpl &Entries, unsigned Part); + SmallVectorImpl &Entries, unsigned Part, + bool ForOrder); /// Checks if the gathered \p VL can be represented as multi-register /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. 
/// \param VL List of scalars (a subset of the TE scalar), checked for /// permutations. + /// \param ForOrder Tries to fetch the best candidates for ordering info. Also + /// commands to build the mask using the original vector value, without + /// relying on the potential reordering. /// \returns per-register series of ShuffleKind, if gathered values can be /// represented as shuffles of previous tree entries. \p Mask is filled with /// the shuffle mask (also on per-register base). @@ -2441,7 +2448,7 @@ class BoUpSLP { isGatherShuffledEntry( const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, SmallVectorImpl> &Entries, - unsigned NumParts); + unsigned NumParts, bool ForOrder = false); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -3788,65 +3795,163 @@ static void reorderOrder(SmallVectorImpl &Order, ArrayRef Mask, std::optional BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); - unsigned NumScalars = TE.Scalars.size(); + // Try to find subvector extract/insert patterns and reorder only such + // patterns. + SmallVector GatheredScalars(TE.Scalars.begin(), TE.Scalars.end()); + Type *ScalarTy = GatheredScalars.front()->getType(); + int NumScalars = GatheredScalars.size(); + if (!isValidElementType(ScalarTy)) + return std::nullopt; + auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars); + int NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= NumScalars) + NumParts = 1; + SmallVector ExtractMask; + SmallVector Mask; + SmallVector> Entries; + SmallVector> ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + SmallVector> GatherShuffles = + isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts, + /*ForOrder=*/true); + // No shuffled operands - ignore. 
+ if (GatherShuffles.empty() && ExtractShuffles.empty()) + return std::nullopt; OrdersType CurrentOrder(NumScalars, NumScalars); - SmallVector Positions; - SmallBitVector UsedPositions(NumScalars); - const TreeEntry *STE = nullptr; - // Try to find all gathered scalars that are gets vectorized in other - // vectorize node. Here we can have only one single tree vector node to - // correctly identify order of the gathered scalars. - for (unsigned I = 0; I < NumScalars; ++I) { - Value *V = TE.Scalars[I]; - if (!isa(V)) - continue; - if (const auto *LocalSTE = getTreeEntry(V)) { - if (!STE) - STE = LocalSTE; - else if (STE != LocalSTE) - // Take the order only from the single vector node. - return std::nullopt; - unsigned Lane = - std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); - if (Lane >= NumScalars) - return std::nullopt; - if (CurrentOrder[Lane] != NumScalars) { - if (Lane != I) + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(TE.Scalars)) { + // Perfect match in the graph, will reuse the previously vectorized + // node. Cost is 0. + std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0); + return CurrentOrder; + } + auto IsSplatMask = [](ArrayRef Mask) { + int SingleElt = PoisonMaskElem; + return all_of(Mask, [&](int I) { + if (SingleElt == PoisonMaskElem && I != PoisonMaskElem) + SingleElt = I; + return I == PoisonMaskElem || I == SingleElt; + }); + }; + // Exclusive broadcast mask - ignore. 
+ if ((ExtractShuffles.empty() && IsSplatMask(Mask) && + (Entries.size() != 1 || + Entries.front().front()->ReorderIndices.empty())) || + (GatherShuffles.empty() && IsSplatMask(ExtractMask))) + return std::nullopt; + SmallBitVector ShuffledSubMasks(NumParts); + auto TransformMaskToOrder = [&](MutableArrayRef CurrentOrder, + ArrayRef Mask, int PartSz, int NumParts, + function_ref GetVF) { + for (int I : seq(0, NumParts)) { + if (ShuffledSubMasks.test(I)) + continue; + const int VF = GetVF(I); + if (VF == 0) + continue; + MutableArrayRef Slice = CurrentOrder.slice(I * PartSz, PartSz); + // Shuffle of at least 2 vectors - ignore. + if (any_of(Slice, [&](int I) { return I != NumScalars; })) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); + continue; + } + // Try to include as much elements from the mask as possible. + int FirstMin = INT_MAX; + int SecondVecFound = false; + for (int K : seq(0, PartSz)) { + int Idx = Mask[I * PartSz + K]; + if (Idx == PoisonMaskElem) { + Value *V = GatheredScalars[I * PartSz + K]; + if (isConstant(V) && !isa(V)) { + SecondVecFound = true; + break; + } continue; - UsedPositions.reset(CurrentOrder[Lane]); + } + if (Idx < VF) { + if (FirstMin > Idx) + FirstMin = Idx; + } else { + SecondVecFound = true; + break; + } } - // The partial identity (where only some elements of the gather node are - // in the identity order) is good. - CurrentOrder[Lane] = I; - UsedPositions.set(I); - } - } - // Need to keep the order if we have a vector entry and at least 2 scalars or - // the vectorized entry has just 2 scalars. 
- if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { - auto &&IsIdentityOrder = [NumScalars](ArrayRef CurrentOrder) { - for (unsigned I = 0; I < NumScalars; ++I) - if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) - return false; - return true; - }; - if (IsIdentityOrder(CurrentOrder)) - return OrdersType(); - auto *It = CurrentOrder.begin(); - for (unsigned I = 0; I < NumScalars;) { - if (UsedPositions.test(I)) { - ++I; + FirstMin = (FirstMin / PartSz) * PartSz; + // Shuffle of at least 2 vectors - ignore. + if (SecondVecFound) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); continue; } - if (*It == NumScalars) { - *It = I; - ++I; + for (int K : seq(0, PartSz)) { + int Idx = Mask[I * PartSz + K]; + if (Idx == PoisonMaskElem) + continue; + Idx -= FirstMin; + if (Idx >= PartSz) { + SecondVecFound = true; + break; + } + if (CurrentOrder[I * PartSz + Idx] > + static_cast(I * PartSz + K) && + CurrentOrder[I * PartSz + Idx] != + static_cast(I * PartSz + Idx)) + CurrentOrder[I * PartSz + Idx] = I * PartSz + K; + } + // Shuffle of at least 2 vectors - ignore. 
+ if (SecondVecFound) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); + continue; } - ++It; } - return std::move(CurrentOrder); + }; + int PartSz = NumScalars / NumParts; + if (!ExtractShuffles.empty()) + TransformMaskToOrder( + CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) { + if (!ExtractShuffles[I]) + return 0U; + unsigned VF = 0; + for (unsigned Idx : seq(0, PartSz)) { + int K = I * PartSz + Idx; + if (ExtractMask[K] == PoisonMaskElem) + continue; + if (!TE.ReuseShuffleIndices.empty()) + K = TE.ReuseShuffleIndices[K]; + if (!TE.ReorderIndices.empty()) + K = std::distance(TE.ReorderIndices.begin(), + find(TE.ReorderIndices, K)); + auto *EI = dyn_cast(TE.Scalars[K]); + if (!EI) + continue; + VF = std::max(VF, cast(EI->getVectorOperandType()) + ->getElementCount() + .getKnownMinValue()); + } + return VF; + }); + // Check special corner case - single shuffle of the same entry. + if (GatherShuffles.size() == 1 && NumParts != 1) { + if (ShuffledSubMasks.any()) + return std::nullopt; + PartSz = NumScalars; + NumParts = 1; } - return std::nullopt; + if (!Entries.empty()) + TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) { + if (!GatherShuffles[I]) + return 0U; + return std::max(Entries[I].front()->getVectorFactor(), + Entries[I].back()->getVectorFactor()); + }); + int NumUndefs = + count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; }); + if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) + return std::nullopt; + return std::move(CurrentOrder); } namespace { @@ -4168,9 +4273,59 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because // element 3 is used twice in the second submask. 
unsigned Sz = TE.Scalars.size(); - if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, - Sz)) + if (TE.State == TreeEntry::NeedToGather) { + if (std::optional CurrentOrder = + findReusedOrderedScalars(TE)) { + SmallVector Mask; + fixupOrderingIndices(*CurrentOrder); + inversePermutation(*CurrentOrder, Mask); + ::addMask(Mask, TE.ReuseShuffleIndices); + OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor()); + unsigned Sz = TE.Scalars.size(); + for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) { + for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz))) + if (Idx != PoisonMaskElem) + Res[Idx + K * Sz] = I + K * Sz; + } + return std::move(Res); + } + } + if (Sz == 2 && TE.getVectorFactor() == 4 && + TTI->getNumberOfParts(FixedVectorType::get( + TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1) return std::nullopt; + if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, + Sz)) { + SmallVector ReorderMask(Sz, PoisonMaskElem); + if (TE.ReorderIndices.empty()) + std::iota(ReorderMask.begin(), ReorderMask.end(), 0); + else + inversePermutation(TE.ReorderIndices, ReorderMask); + ::addMask(ReorderMask, TE.ReuseShuffleIndices); + unsigned VF = ReorderMask.size(); + OrdersType ResOrder(VF, VF); + unsigned NumParts = VF / Sz; + SmallBitVector UsedVals(NumParts); + for (unsigned I = 0; I < VF; I += Sz) { + int Val = PoisonMaskElem; + unsigned UndefCnt = 0; + if (any_of(ArrayRef(ReorderMask).slice(I, Sz), + [&](int Idx) { + if (Val == PoisonMaskElem && Idx != PoisonMaskElem) + Val = Idx; + if (Idx == PoisonMaskElem) + ++UndefCnt; + return Idx != PoisonMaskElem && Idx != Val; + }) || + Val >= static_cast(NumParts) || UsedVals.test(Val) || + UndefCnt > Sz / 2) + return std::nullopt; + UsedVals.set(Val); + for (unsigned K = 0; K < NumParts; ++K) + ResOrder[Val + Sz * K] = I + K; + } + return std::move(ResOrder); + } unsigned VF = TE.getVectorFactor(); // Try build correct order for extractelement 
instructions. SmallVector ReusedMask(TE.ReuseShuffleIndices.begin(), @@ -4208,7 +4363,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); std::advance(It, Sz); } - if (all_of(enumerate(ResOrder), + if (TE.State == TreeEntry::NeedToGather && + all_of(enumerate(ResOrder), [](const auto &Data) { return Data.index() == Data.value(); })) return std::nullopt; // No need to reorder. return std::move(ResOrder); @@ -4298,11 +4454,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { OrdersType CurrentOrder; bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, /*ResizeAllowed=*/true); - if (Reuse || !CurrentOrder.empty()) { - if (!CurrentOrder.empty()) - fixupOrderingIndices(CurrentOrder); + if (Reuse || !CurrentOrder.empty()) return std::move(CurrentOrder); - } } // If the gather node is and // insertelement poison, v, 0 [+ permute] @@ -4335,8 +4488,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { InstructionCost InsertIdxCost = TTI->getVectorInstrCost( Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx, PoisonValue::get(Ty), *It); - if (InsertFirstCost + PermuteCost < InsertIdxCost) + if (InsertFirstCost + PermuteCost < InsertIdxCost) { + OrdersType Order(Sz, Sz); + Order[Idx] = 0; return std::move(Order); + } } } if (isSplat(TE.Scalars)) @@ -4392,6 +4548,28 @@ void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef Mask) const { std::iota(It, std::next(It, Sz), 0); } +static void combineOrders(MutableArrayRef Order, + ArrayRef SecondaryOrder) { + assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) && + "Expected same size of orders"); + unsigned Sz = Order.size(); + SmallBitVector UsedIndices(Sz); + for (unsigned Idx : seq(0, Sz)) { + if (Order[Idx] != Sz) + UsedIndices.set(Order[Idx]); + } + if (SecondaryOrder.empty()) { + for (unsigned Idx : seq(0, Sz)) + if (Order[Idx] == Sz && 
!UsedIndices.test(Idx)) + Order[Idx] = Idx; + } else { + for (unsigned Idx : seq(0, Sz)) + if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz && + !UsedIndices.test(SecondaryOrder[Idx])) + Order[Idx] = SecondaryOrder[Idx]; + } +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap> VFToOrderedEntries; @@ -4560,18 +4738,46 @@ void BoUpSLP::reorderTopToBottom() { } if (OrdersUses.empty()) continue; + auto IsIdentityOrder = [](ArrayRef Order) { + const unsigned Sz = Order.size(); + for (unsigned Idx : seq(0, Sz)) + if (Idx != Order[Idx] && Order[Idx] != Sz) + return false; + return true; + }; // Choose the most used order. - ArrayRef BestOrder = OrdersUses.front().first; - unsigned Cnt = OrdersUses.front().second; - for (const auto &Pair : drop_begin(OrdersUses)) { - if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + unsigned IdentityCnt = 0; + unsigned FilledIdentityCnt = 0; + OrdersType IdentityOrder(VF, VF); + for (auto &Pair : OrdersUses) { + if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + if (!Pair.first.empty()) + FilledIdentityCnt += Pair.second; + IdentityCnt += Pair.second; + combineOrders(IdentityOrder, Pair.first); + } + } + MutableArrayRef BestOrder = IdentityOrder; + unsigned Cnt = IdentityCnt; + for (auto &Pair : OrdersUses) { + // Prefer identity order. But, if filled identity found (non-empty order) + // with same number of uses, as the new candidate order, we can choose + // this candidate order. + if (Cnt < Pair.second || + (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt && + Cnt == Pair.second && !BestOrder.empty() && + IsIdentityOrder(BestOrder))) { + combineOrders(Pair.first, BestOrder); BestOrder = Pair.first; Cnt = Pair.second; + } else { + combineOrders(BestOrder, Pair.first); } } // Set order of the user node. 
- if (BestOrder.empty()) + if (IsIdentityOrder(BestOrder)) continue; + fixupOrderingIndices(BestOrder); SmallVector Mask; inversePermutation(BestOrder, Mask); SmallVector MaskOrder(BestOrder.size(), PoisonMaskElem); @@ -4685,7 +4891,7 @@ bool BoUpSLP::canReorderOperands( void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SetVector OrderedEntries; - DenseMap GathersToOrders; + DenseSet GathersToOrders; // Find all reorderable leaf nodes with the given VF. // Currently the are vectorized loads,extracts without alternate operands + // some gathering of extracts. @@ -4700,7 +4906,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize) || !TE->ReuseShuffleIndices.empty()) - GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + GathersToOrders.insert(TE.get()); } } @@ -4718,7 +4924,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || (TE->State == TreeEntry::NeedToGather && - GathersToOrders.count(TE))) || + GathersToOrders.contains(TE))) || TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || !all_of(drop_begin(TE->UserTreeIndices), [TE](const EdgeInfo &EI) { @@ -4775,9 +4981,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const auto Order = [&]() -> const OrdersType { if (OpTE->State == TreeEntry::NeedToGather || !OpTE->ReuseShuffleIndices.empty()) - return GathersToOrders.find(OpTE)->second; + return getReorderingData(*OpTE, /*TopToBottom=*/false) + .value_or(OrdersType(1)); return OpTE->ReorderIndices; }(); + // The order is partially ordered, skip it in favor of fully non-ordered + // orders. 
+ if (Order.size() == 1) + continue; unsigned NumOps = count_if( Data.second, [OpTE](const std::pair &P) { return P.second == OpTE; @@ -4805,9 +5016,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { (IgnoreReorder && TE->Idx == 0)) return true; if (TE->State == TreeEntry::NeedToGather) { - auto It = GathersToOrders.find(TE); - if (It != GathersToOrders.end()) - return !It->second.empty(); + if (GathersToOrders.contains(TE)) + return !getReorderingData(*TE, /*TopToBottom=*/false) + .value_or(OrdersType(1)) + .empty(); return true; } return false; @@ -4839,21 +5051,49 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { ++Res.first->second; } } - // Choose the best order. - ArrayRef BestOrder = OrdersUses.front().first; - unsigned Cnt = OrdersUses.front().second; - for (const auto &Pair : drop_begin(OrdersUses)) { - if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + if (OrdersUses.empty()) { + for (const std::pair &Op : Data.second) + OrderedEntries.remove(Op.second); + continue; + } + auto IsIdentityOrder = [](ArrayRef Order) { + const unsigned Sz = Order.size(); + for (unsigned Idx : seq(0, Sz)) + if (Idx != Order[Idx] && Order[Idx] != Sz) + return false; + return true; + }; + // Choose the most used order. + unsigned IdentityCnt = 0; + unsigned VF = Data.second.front().second->getVectorFactor(); + OrdersType IdentityOrder(VF, VF); + for (auto &Pair : OrdersUses) { + if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + IdentityCnt += Pair.second; + combineOrders(IdentityOrder, Pair.first); + } + } + MutableArrayRef BestOrder = IdentityOrder; + unsigned Cnt = IdentityCnt; + for (auto &Pair : OrdersUses) { + // Prefer identity order. But, if filled identity found (non-empty + // order) with same number of uses, as the new candidate order, we can + // choose this candidate order. 
+ if (Cnt < Pair.second) { + combineOrders(Pair.first, BestOrder); BestOrder = Pair.first; Cnt = Pair.second; + } else { + combineOrders(BestOrder, Pair.first); } } - // Set order of the user node (reordering of operands and user nodes). - if (BestOrder.empty()) { + // Set order of the user node. + if (IsIdentityOrder(BestOrder)) { for (const std::pair &Op : Data.second) OrderedEntries.remove(Op.second); continue; } + fixupOrderingIndices(BestOrder); // Erase operands from OrderedEntries list and adjust their orders. VisitedOps.clear(); SmallVector Mask; @@ -7472,6 +7712,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } V1 = Constant::getNullValue( FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + // Not identity/broadcast? Try to see if the original vector is better. + if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() && + CommonVF == CommonMask.size() && + any_of(enumerate(CommonMask), + [](const auto &&P) { + return P.value() != PoisonMaskElem && + static_cast(P.value()) != P.index(); + }) && + any_of(CommonMask, + [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) { + SmallVector ReorderMask; + inversePermutation(E->ReorderIndices, ReorderMask); + ::addMask(CommonMask, ReorderMask); + } } else if (V1 && P2.isNull()) { // Shuffle single vector. CommonVF = cast(V1->getType())->getNumElements(); @@ -9433,7 +9687,7 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl &VL, std::optional BoUpSLP::isGatherShuffledSingleRegisterEntry( const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, - SmallVectorImpl &Entries, unsigned Part) { + SmallVectorImpl &Entries, unsigned Part, bool ForOrder) { Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. 
@@ -9532,6 +9786,21 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { + if (ForOrder) { + if (VTE->State != TreeEntry::Vectorize) { + auto It = MultiNodeScalars.find(V); + if (It == MultiNodeScalars.end()) + continue; + VTE = *It->getSecond().begin(); + // Iterate through all vectorized nodes. + auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) { + return MTE->State == TreeEntry::Vectorize; + }); + if (MIt == It->getSecond().end()) + continue; + VTE = *MIt; + } + } Instruction &LastBundleInst = getLastInstructionInBundle(VTE); if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) continue; @@ -9765,8 +10034,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // scalar in the list. for (const std::pair &Pair : EntryLanes) { unsigned Idx = Part * VL.size() + Pair.second; - Mask[Idx] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + Mask[Idx] = + Pair.first * VF + + (ForOrder ? 
std::distance( + Entries[Pair.first]->Scalars.begin(), + find(Entries[Pair.first]->Scalars, VL[Pair.second])) + : Entries[Pair.first]->findLaneForValue(VL[Pair.second])); IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { @@ -9791,8 +10064,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( SmallVector> BoUpSLP::isGatherShuffledEntry( const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, - SmallVectorImpl> &Entries, - unsigned NumParts) { + SmallVectorImpl> &Entries, unsigned NumParts, + bool ForOrder) { assert(NumParts > 0 && NumParts < VL.size() && "Expected positive number of registers."); Entries.clear(); @@ -9810,7 +10083,8 @@ BoUpSLP::isGatherShuffledEntry( ArrayRef SubVL = VL.slice(Part * SliceSize, SliceSize); SmallVectorImpl &SubEntries = Entries.emplace_back(); std::optional SubRes = - isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part, + ForOrder); if (!SubRes) SubEntries.clear(); Res.push_back(SubRes); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll index 8f76b2e54e6c2..44542f32bf145 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -76,10 +76,10 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector 
<2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32> @@ -107,12 +107,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0 ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32> ; 
CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]] @@ -152,12 +152,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> ; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32> ; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]] @@ -166,9 +166,9 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]] ; CHECK: while.end166: ; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 ; CHECK-NEXT: store i32 [[TMP53]], ptr [[CTT:%.*]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 +; CHECK-NEXT: [[TMP54:%.*]] = 
extractelement <4 x i32> [[TMP52]], i32 2 ; CHECK-NEXT: store i32 [[TMP54]], ptr [[CFF:%.*]], align 4 ; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1 ; CHECK-NEXT: store i32 [[TMP55]], ptr [[CTF:%.*]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll index 0a68996410448..dc05967af1529 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll @@ -6,7 +6,7 @@ define i32 @foo(i32 %v1, double %v2) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[V1:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: br label [[FOR_COND15_PREHEADER:%.*]] ; CHECK: for.cond15.preheader: ; CHECK-NEXT: br label [[IF_END:%.*]] @@ -26,14 +26,15 @@ define i32 @foo(i32 %v1, double %v2) { ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP8]]) ; CHECK-NEXT: br label [[SW_EPILOG:%.*]] ; CHECK: sw.bb195: ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: do.body: ; CHECK-NEXT: unreachable ; CHECK: sw.epilog: -; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x double> [ undef, 
[[SW_BB195]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP9]], [[SW_BB]] ] ; CHECK-NEXT: ret i32 undef ; CHECK: if.end.1: ; CHECK-NEXT: br label [[FOR_COND15_1:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index 28af0de171231..95aa40f664c0c 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -20,17 +20,17 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 5707e143ad551..89ea15d3ab3bc 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -143,16 +143,17 @@ define void @gather_2(ptr %mat1, float %0, float %1) { ; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP2]], 0.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00) +; CHECK-NEXT: 
[[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00) +; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00 +; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00 ; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1 -; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX163]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX2_I_I_I278]], align 4 +; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2 +; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX163]], align 4 +; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4 +; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -183,19 +184,18 @@ define i32 @reorder_indices_1(float %0) { ; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: 
[[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00 +; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4 ; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll index 9c7e8f66c6c6c..cb24a9cefffa2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -68,10 +68,10 @@ define void @pr35497() local_unnamed_addr #0 { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> ; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer ; SSE-NEXT: store <2 x i64> 
[[TMP5]], ptr undef, align 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], -; SSE-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], ; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], ; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] ; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 @@ -88,10 +88,10 @@ define void @pr35497() local_unnamed_addr #0 { ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], -; AVX-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], -; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> +; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1 +; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], ; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], ; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]] ; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll index c051d909f752e..ec90ca9bc674d 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll @@ -18,9 +18,9 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> ; SSE2-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) ; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] @@ -28,9 +28,9 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, < ; ; SSE42-LABEL: @reduce_and4( ; SSE42-NEXT: entry: -; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> ; SSE42-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) ; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] ; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] @@ -92,18 +92,18 @@ entry: define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4_transpose( -; 
SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> ; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; SSE2-NEXT: ret i32 [[OP_RDX1]] ; ; SSE42-LABEL: @reduce_and4_transpose( -; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> ; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; SSE42-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] ; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll index b5533463c3930..1a6ff2385905b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll @@ -17,13 +17,12 @@ define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = 
insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP13]], false +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP12]], false ; CHECK-NEXT: ret i1 [[OP_RDX]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index f65f61975a61f..cd7ad210ca567 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -8,12 +8,11 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x ptr> [[TMP3]], <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> , <8 x float> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> 
poison, <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: store <16 x float> [[TMP11]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <16 x float> [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: ret void ; %2 = getelementptr inbounds float, ptr %p, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll index af606fc3a738b..d3c978412cdde 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll @@ -6,7 +6,7 @@ define void @main(ptr %0) { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[TMP0:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = fcmp oeq <4 x double> [[TMP7]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index c79e9b94278cd..fb2b653aefc87 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -12,10 +12,10 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> , <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> , <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: @@ -23,12 +23,11 @@ define void @test() { ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP1]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = fsub <2 x 
float> [[TMP14]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[ARRAYIDX21_I]], align 16 +; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll index 8d1d257820f0c..9e3ba05f88da8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll @@ -9,10 +9,10 @@ define void @foo(ptr %this, ptr %p, i32 %add7) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ADD7:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: switch i32 undef, label [[SW_EPILOG:%.*]] [ -; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 2, label [[SW_BB]] +; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[TMP1]], @@ -21,10 +21,11 @@ define void @foo(ptr %this, ptr %p, i32 %add7) { ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: sw.epilog: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> undef, [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[P:%.*]], align 4 ; 
CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 9584a663b2d48..46cca9b078ac6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -182,19 +182,18 @@ define i32 @reorder_indices_1(float %0) { ; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; CHECK-NEXT: store <2 
x float> [[TMP16]], ptr [[NOR1]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00 +; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4 ; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 ; CHECK-NEXT: ret i32 0 ; From 2685e7eadce08125672f0f6013145ae45b7a5ac3 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Thu, 22 Feb 2024 13:34:00 -0600 Subject: [PATCH 263/351] [lldb][docs] Remove/update docs pointing to unittest2 (#82672) --- lldb/docs/resources/test.rst | 8 ++++---- lldb/docs/testsuite/a-detailed-walkthrough.txt | 9 ++++----- .../Python/lldbsuite/test/README-TestSuite | 14 -------------- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/lldb/docs/resources/test.rst b/lldb/docs/resources/test.rst index 52757864539ea..2b0e9010fe280 100644 --- a/lldb/docs/resources/test.rst +++ b/lldb/docs/resources/test.rst @@ -17,8 +17,8 @@ The LLDB test suite consists of three different kinds of test: the output. * **API tests**: Integration tests that interact with the debugger through the SB API. These are written in Python and use LLDB's ``dotest.py`` testing - framework on top of Python's `unittest2 - `_. + framework on top of Python's `unittest + `_. All three test suites use ``lit`` (`LLVM Integrated Tester `_ ) as the test driver. The test @@ -94,7 +94,7 @@ programs from source, run them, and debug the processes. As mentioned before, ``dotest.py`` is LLDB's testing framework. 
The implementation is located under ``lldb/packages/Python/lldbsuite``. We have several extensions and custom test primitives on top of what's offered by -`unittest2 `_. Those can be +`unittest `_. Those can be found in `lldbtest.py `_. @@ -146,7 +146,7 @@ the test should be run or not. :: - @expectedFailure(checking_function_name) + @skipTestIfFn(checking_function_name) In addition to providing a lot more flexibility when it comes to writing the test, the API test also allow for much more complex scenarios when it comes to diff --git a/lldb/docs/testsuite/a-detailed-walkthrough.txt b/lldb/docs/testsuite/a-detailed-walkthrough.txt index 57c9dbce3d0ab..8a7043786c190 100644 --- a/lldb/docs/testsuite/a-detailed-walkthrough.txt +++ b/lldb/docs/testsuite/a-detailed-walkthrough.txt @@ -58,16 +58,15 @@ display their output. For brevity, the '-t' output is not included here. Notice the 'expected failures=1' message at the end of the run. This is because of a bug currently in lldb such that setting target.process.output-path to 'stdout.txt' does not have any effect on the redirection of the standard output -of the subsequent launched process. We are using unittest2 (a backport of new -unittest features for Python 2.4-2.6) to decorate (mark) the particular test -method as such: +of the subsequent launched process. We are using unittest to decorate (mark) +the particular test method as such: - @unittest2.expectedFailure + @unittest.expectedFailure # rdar://problem/8435794 # settings set target.process.output-path does not seem to work def test_set_output_path(self): -See http://pypi.python.org/pypi/unittest2 for more details. +See http://docs.python.org/library/unittest.html for more details. 
Now let's look inside the test method: diff --git a/lldb/packages/Python/lldbsuite/test/README-TestSuite b/lldb/packages/Python/lldbsuite/test/README-TestSuite index f76e836ab777c..388f94da0c409 100644 --- a/lldb/packages/Python/lldbsuite/test/README-TestSuite +++ b/lldb/packages/Python/lldbsuite/test/README-TestSuite @@ -91,20 +91,6 @@ to the Python test suite under the current 'test' directory. Contains platform specific plugin to build binaries with dsym/dwarf debugging info. Other platform specific functionalities may be added in the future. -- unittest2 directory - - Many new features were added to unittest in Python 2.7, including test - discovery. unittest2 allows you to use these features with earlier versions of - Python. - - It currently has unittest2 0.5.1 from http://pypi.python.org/pypi/unittest2. - Version 0.5.1 of unittest2 has feature parity with unittest in Python 2.7 - final. If you want to ensure that your tests run identically under unittest2 - and unittest in Python 2.7 you should use unittest2 0.5.1. - - Later versions of unittest2 include changes in unittest made in Python 3.2 and - onwards after the release of Python 2.7. - - Profiling dotest.py runs I used the following command line thingy to do the profiling on a SnowLeopard From e88c255313872185b8c9738d9fa0e624de1e1bea Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 03:40:39 +0800 Subject: [PATCH 264/351] [InstCombine] Add support for cast instructions in `getFreelyInvertedImpl` (#82451) This patch adds support for cast instructions in `getFreelyInvertedImpl` to enable more optimizations. 
Alive2: https://alive2.llvm.org/ce/z/F6maEE --- .../InstCombine/InstructionCombining.cpp | 14 +++ llvm/test/Transforms/InstCombine/not.ll | 89 +++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 4af455c37c788..87c8dca7efed8 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2387,6 +2387,20 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses, return NonNull; } + if (match(V, m_SExtLike(m_Value(A)))) { + if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder, + DoesConsume, Depth)) + return Builder ? Builder->CreateSExt(AV, V->getType()) : NonNull; + return nullptr; + } + + if (match(V, m_Trunc(m_Value(A)))) { + if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder, + DoesConsume, Depth)) + return Builder ? Builder->CreateTrunc(AV, V->getType()) : NonNull; + return nullptr; + } + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll index 3b0e5b4412fbe..f277d13eee930 100644 --- a/llvm/test/Transforms/InstCombine/not.ll +++ b/llvm/test/Transforms/InstCombine/not.ll @@ -769,3 +769,92 @@ entry: %cmp = icmp sle i32 %select, %not.c ret i1 %cmp } + +define i32 @test_sext(i32 %a, i32 %b){ +; CHECK-LABEL: @test_sext( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[NOT:%.*]] = sub i32 [[TMP2]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[NOT]] +; + %cmp = icmp eq i32 %a, 0 + %sext = sext i1 %cmp to i32 + %add = add i32 %b, %sext + %not = xor i32 %add, -1 + ret i32 %not +} + +define <2 x i32> @test_sext_vec(<2 x i32> %a, <2 x i32> %b){ +; CHECK-LABEL: @test_sext_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i1> 
[[TMP1]] to <2 x i32> +; CHECK-NEXT: [[NOT:%.*]] = sub <2 x i32> [[TMP2]], [[B:%.*]] +; CHECK-NEXT: ret <2 x i32> [[NOT]] +; + %cmp = icmp eq <2 x i32> %a, zeroinitializer + %sext = sext <2 x i1> %cmp to <2 x i32> + %add = add <2 x i32> %b, %sext + %not = xor <2 x i32> %add, + ret <2 x i32> %not +} + +define i64 @test_zext_nneg(i32 %c1, i64 %c2, i64 %c3){ +; CHECK-LABEL: @test_zext_nneg( +; CHECK-NEXT: [[DOTNEG:%.*]] = add i64 [[C2:%.*]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[C1:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[C3:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i64 [[DOTNEG]], [[TMP2]] +; CHECK-NEXT: ret i64 [[SUB]] +; + %not = xor i32 %c1, -1 + %conv = zext nneg i32 %not to i64 + %add1 = add i64 %c2, -5 + %add2 = add i64 %conv, %c3 + %sub = sub i64 %add1, %add2 + ret i64 %sub +} + +define i8 @test_trunc(i8 %a){ +; CHECK-LABEL: @test_trunc( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i8 [[A:%.*]], 0 +; CHECK-NEXT: [[NOT:%.*]] = sext i1 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[NOT]] +; + %zext = zext i8 %a to i32 + %sub = add nsw i32 %zext, -1 + %shr = ashr i32 %sub, 31 + %conv = trunc i32 %shr to i8 + %not = xor i8 %conv, -1 + ret i8 %not +} + +define <2 x i8> @test_trunc_vec(<2 x i8> %a){ +; CHECK-LABEL: @test_trunc_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i8> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[NOT:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[NOT]] +; + %zext = zext <2 x i8> %a to <2 x i32> + %sub = add nsw <2 x i32> %zext, + %shr = ashr <2 x i32> %sub, + %conv = trunc <2 x i32> %shr to <2 x i8> + %not = xor <2 x i8> %conv, + ret <2 x i8> %not +} + +; Negative tests + +define i32 @test_zext(i32 %a, i32 %b){ +; CHECK-LABEL: @test_zext( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEXT]], [[B:%.*]] +; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[ADD]], -1 +; CHECK-NEXT: ret i32 [[NOT]] +; + %cmp = icmp eq i32 %a, 
0 + %sext = zext i1 %cmp to i32 + %add = add i32 %b, %sext + %not = xor i32 %add, -1 + ret i32 %not +} From 3b20fb336d1191e7b969c30825ca8b9423550902 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Thu, 22 Feb 2024 11:43:11 -0800 Subject: [PATCH 265/351] [bazel] add missing dep after 5b079af169cd04b457465fd7ca31714efeefe6d9 --- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 8d11fb9be188f..09c53c9e8a131 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -613,14 +613,15 @@ libc_support_library( libc_support_library( name = "__support_fixed_point", hdrs = [ - "src/__support/fixed_point/fx_rep.h", "src/__support/fixed_point/fx_bits.h", + "src/__support/fixed_point/fx_rep.h", ], deps = [ ":__support_cpp_bit", ":__support_cpp_type_traits", ":__support_macros_attributes", ":__support_macros_optimization", + ":__support_math_extras", ":llvm_libc_macros_stdfix_macros", ], ) From f5c8e9e53130a628c2c3d25c2cbc308e62d2f3e0 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 22 Feb 2024 19:55:18 +0000 Subject: [PATCH 266/351] LoopVectorize/test: guard pr72969 with asserts (#82653) Follow up on 695a9d8 (LoopVectorize: add test for crash in #72969) to guard pr72969.ll with REQUIRES: asserts, in order to be reasonably confident that it will crash reliably. 
--- llvm/test/Transforms/LoopVectorize/X86/pr72969.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll index 40633c6c8383b..738f5cbaebea5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -1,3 +1,4 @@ +; REQUIRES: asserts ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s ; REQUIRES: asserts From c1e9883a813db76c1b108ad715895928bb93f4c2 Mon Sep 17 00:00:00 2001 From: Matthias Gehre <93204396+mgehre-amd@users.noreply.github.com> Date: Thu, 22 Feb 2024 21:16:33 +0100 Subject: [PATCH 267/351] [TOSA] TosaToLinalg: fix int64_t min/max lowering of clamp (#82641) tosa.clamp takes `min`/`max` attributes as i64, so ensure that the lowering to linalg works for the whole range. 
Co-authored-by: Tiago Trevisan Jost --- .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 24 +++++++++---------- .../TosaToLinalg/tosa-to-linalg.mlir | 15 ++++++++++++ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 7eb32ebe3228f..7c477f2e1412b 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -384,23 +384,23 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, if (isa(op) && isa(elementTy)) { auto intTy = cast(elementTy); - int32_t min = static_cast( - cast(op->getAttr("min_int")).getValue().getSExtValue()); - int32_t max = static_cast( - cast(op->getAttr("max_int")).getValue().getSExtValue()); + int64_t min = + cast(op->getAttr("min_int")).getValue().getSExtValue(); + int64_t max = + cast(op->getAttr("max_int")).getValue().getSExtValue(); if (intTy.isUnsignedInteger()) { - min = std::max(min, 0); - max = std::min( + min = std::max(min, (int64_t)0); + max = std::min( max, APInt::getMaxValue(intTy.getIntOrFloatBitWidth()).getSExtValue()); } else { - min = std::max( - min, APInt::getSignedMinValue(intTy.getIntOrFloatBitWidth()) - .getSExtValue()); - max = std::min( - max, APInt::getSignedMaxValue(intTy.getIntOrFloatBitWidth()) - .getSExtValue()); + min = + std::max(min, APInt::getSignedMinValue(intTy.getIntOrFloatBitWidth()) + .getSExtValue()); + max = + std::min(max, APInt::getSignedMaxValue(intTy.getIntOrFloatBitWidth()) + .getSExtValue()); } auto minVal = rewriter.create( diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index febe74e876746..1fa783f05f04e 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -759,6 +759,21 @@ func.func @test_i8(%arg0: tensor<1xi8>) -> () { // ----- +// CHECK-LABEL: 
@test_i64 +func.func @test_i64(%arg0: tensor<1xi64>) -> () { + // CHECK: linalg.generic + // CHECK: ^bb0(%[[ARG1:.+]]: i64, + // CHECK-DAG: %[[C127:.+]] = arith.constant -9223372036854775808 + // CHECK-DAG: %[[C126:.+]] = arith.constant 9223372036854775807 + // CHECK-DAG: %[[LOWER:.+]] = arith.maxsi %[[C127]], %[[ARG1]] + // CHECK-DAG: %[[CLAMPED:.+]] = arith.minsi %[[C126]], %[[LOWER]] + %0 = tosa.clamp %arg0 {min_int = -9223372036854775808 : i64, max_int = 9223372036854775807 : i64, min_fp = 0.0 : f32, max_fp = 0.0 : f32} : (tensor<1xi64>) -> tensor<1xi64> + + return +} + +// ----- + + // CHECK-LABEL: @test_clamp_f16 func.func @test_clamp_f16(%arg0: tensor<1xf16>) -> () { // CHECK: linalg.generic From 66f6929fec3ae4770368b60aa1920623ab835f9d Mon Sep 17 00:00:00 2001 From: Chris B Date: Thu, 22 Feb 2024 14:32:24 -0600 Subject: [PATCH 268/351] [HLSL][Doc] Add doc about expected differences (#82395) This document covers expected differences between Clang and the HLSL reference compiler implementations (FXC & DXC). The document is not intended to be exhaustive, but it should be a best effort to cover known cases. This document should document both the behavioral difference and the explanation of why Clang differs. The initial document covers known overload resolution differences. --------- Co-authored-by: S. Bharadwaj Yadavalli --- clang/docs/HLSL/ExpectedDifferences.rst | 110 ++++++++++++++++++++++++ clang/docs/HLSL/HLSLDocs.rst | 1 + 2 files changed, 111 insertions(+) create mode 100644 clang/docs/HLSL/ExpectedDifferences.rst diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst new file mode 100644 index 0000000000000..60001b22dc792 --- /dev/null +++ b/clang/docs/HLSL/ExpectedDifferences.rst @@ -0,0 +1,110 @@ + +Expected Differences vs DXC and FXC +=================================== + +..
contents:: + :local: + +Introduction +============ + +HLSL currently has two reference compilers, the `DirectX Shader Compiler (DXC) +`_ and the +`Effect-Compiler (FXC) `_. +The two reference compilers do not fully agree. Some known disagreements in the +references are tracked on +`DXC's GitHub +`_, +but many more are known to exist. + +HLSL as implemented by Clang will also not fully match either of the reference +implementations, it is instead being written to match the `draft language +specification `_. + +This document is a non-exhaustive collection of the known differences between +Clang's implementation of HLSL and the existing reference compilers. + +General Principles +------------------ + +Most of the intended differences between Clang and the earlier reference +compilers are focused on increased consistency and correctness. Both reference +compilers do not always apply language rules the same in all contexts. + +Clang also deviates from the reference compilers by providing different +diagnostics, both in terms of the textual messages and the contexts in which +diagnostics are produced. While striving for a high level of source +compatibility with conforming HLSL code, Clang may produce earlier and more +robust diagnostics for incorrect code or reject code that a reference compiler +incorrectly accepted. + +Language Version +================ + +Clang targets language compatibility for HLSL 2021 as implemented by DXC. +Language features that were removed in earlier versions of HLSL may be added on +a case-by-case basis, but are not planned for the initial implementation. + +Overload Resolution +=================== + +Clang's HLSL implementation adopts C++ overload resolution rules as proposed for +HLSL 202x based on proposal +`0007 `_ +and +`0008 `_. + +Clang's implementation extends standard overload resolution rules to HLSL +library functionality. This causes subtle changes in overload resolution +behavior between Clang and DXC. Some examples include: + +..
code-block:: c++ + + void halfOrInt16(half H); + void halfOrInt16(uint16_t U); + void halfOrInt16(int16_t I); + + void takesDoubles(double, double, double); + + cbuffer CB { + uint U; + int I; + float X, Y, Z; + double3 A, B; + } + + export void call() { + halfOrInt16(U); // DXC: Fails with call ambiguous between int16_t and uint16_t overloads + // Clang: Resolves to halfOrInt16(uint16_t). + halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). + half H; + #ifndef IGNORE_ERRORS + // asfloat16 is a builtin with overloads for half, int16_t, and uint16_t. + H = asfloat16(I); // DXC: Fails to resolve overload for int. + // Clang: Resolves to asfloat16(int16_t). + H = asfloat16(U); // DXC: Fails to resolve overload for int. + // Clang: Resolves to asfloat16(uint16_t). + #endif + H = asfloat16(0x01); // DXC: Resolves to asfloat16(half). + // Clang: Resolves to asfloat16(uint16_t). + + takesDoubles(X, Y, Z); // Works on all compilers + #ifndef IGNORE_ERRORS + fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double. + // Clang: Resolves to fma(double,double,double). + #endif + + double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. + // FXC: Expands to compute double dot product with fmul/fadd + // Clang: Resolves to dot(float3, float3), emits conversion warnings. + + } + +.. note:: + + In Clang, a conscious decision was made to exclude the ``dot(vector, vector)`` + overload and allow overload resolution to resolve the + ``vector`` overload. This approach provides ``-Wconversion`` + diagnostic notifying the user of the conversion rather than silently altering + precision relative to the other overloads (as FXC does) or generating code + that will fail validation (as DXC does). diff --git a/clang/docs/HLSL/HLSLDocs.rst b/clang/docs/HLSL/HLSLDocs.rst index 1f232129548d0..97b2425f013b3 100644 --- a/clang/docs/HLSL/HLSLDocs.rst +++ b/clang/docs/HLSL/HLSLDocs.rst @@ -11,6 +11,7 @@ HLSL Design and Implementation .. 
toctree:: :maxdepth: 1 + ExpectedDifferences HLSLIRReference ResourceTypes EntryFunctions From 847048f497bcdfcfe52f36cba49f07bdbd63cd24 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Thu, 22 Feb 2024 12:37:32 -0800 Subject: [PATCH 269/351] [mlir][Vector] Fix bug in vector xfer op flattening transformation (#81964) It looks like the affine map generated to compute the indices of the collapsed dimensions used the wrong dim size. For indices `[idx0][idx1]` we computed the collapsed index as `idx0*size0 + idx1` instead of `idx0*size1 + idx1`. This led to correctness issues in convolution tests when enabling this transformation internally. --- .../mlir/Dialect/Utils/IndexingUtils.h | 3 ++ mlir/lib/Dialect/Utils/IndexingUtils.cpp | 11 ++++- .../Transforms/VectorTransferOpTransforms.cpp | 41 +++++++++++-------- .../Vector/vector-transfer-flatten.mlir | 32 ++++++++++++++- 4 files changed, 65 insertions(+), 22 deletions(-) diff --git a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h index 2453d841f633e..9892253df2bff 100644 --- a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h +++ b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h @@ -257,6 +257,9 @@ SmallVector getI64SubArray(ArrayAttr arrayAttr, unsigned dropFront = 0, std::pair> computeLinearIndex(OpFoldResult sourceOffset, ArrayRef strides, ArrayRef indices); +std::pair> +computeLinearIndex(OpFoldResult sourceOffset, ArrayRef strides, + ArrayRef indices); //===----------------------------------------------------------------------===// // Utilities for decomposing larger shapes diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp index baaa581ab6f22..4c960659d80cb 100644 --- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp +++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp @@ -7,13 +7,12 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Utils/IndexingUtils.h" - +#include 
"mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/MLIRContext.h" #include "llvm/ADT/STLExtras.h" - #include #include @@ -306,6 +305,14 @@ mlir::computeLinearIndex(OpFoldResult sourceOffset, return {expr, values}; } +std::pair> +mlir::computeLinearIndex(OpFoldResult sourceOffset, ArrayRef strides, + ArrayRef indices) { + return computeLinearIndex( + sourceOffset, getAsIndexOpFoldResult(sourceOffset.getContext(), strides), + getAsOpFoldResult(ValueRange(indices))); +} + //===----------------------------------------------------------------------===// // TileOffsetRange //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp index 04e5a816dd91e..0ffef6aabccc1 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -15,6 +15,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" @@ -577,7 +578,6 @@ class FlattenContiguousRowMajorTransferReadPattern if (transferReadOp.getMask()) return failure(); - SmallVector collapsedIndices; int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank(); // 1. Collapse the source memref @@ -599,12 +599,14 @@ class FlattenContiguousRowMajorTransferReadPattern // 2.2 New indices // If all the collapsed indices are zero then no extra logic is needed. // Otherwise, a new offset/index has to be computed. 
+ SmallVector collapsedIndices; if (failed(checkAndCollapseInnerZeroIndices(transferReadOp.getIndices(), firstDimToCollapse, collapsedIndices))) { - // Copy all the leading indices - collapsedIndices = transferReadOp.getIndices(); - collapsedIndices.resize(firstDimToCollapse); + // Copy all the leading indices. + SmallVector indices = transferReadOp.getIndices(); + collapsedIndices.append(indices.begin(), + indices.begin() + firstDimToCollapse); // Compute the remaining trailing index/offset required for reading from // the collapsed memref: @@ -621,24 +623,26 @@ class FlattenContiguousRowMajorTransferReadPattern // memref<1x86xi32>, vector<2xi32> // one would get the following offset: // %offset = %arg0 * 43 - AffineExpr offsetExpr, idxExpr; - bindSymbols(rewriter.getContext(), offsetExpr, idxExpr); - - int64_t outputRank = transferReadOp.getIndices().size(); - OpFoldResult offset = + OpFoldResult collapsedOffset = rewriter.create(loc, 0).getResult(); - for (int64_t i = firstDimToCollapse; i < outputRank; ++i) { - int64_t dim = dyn_cast(source.getType()).getDimSize(i); - offset = affine::makeComposedFoldedAffineApply( - rewriter, loc, offsetExpr + dim * idxExpr, - {offset, transferReadOp.getIndices()[i]}); - } - if (offset.is()) { - collapsedIndices.push_back(offset.get()); + auto sourceShape = sourceType.getShape(); + auto collapsedStrides = computeSuffixProduct(ArrayRef( + sourceShape.begin() + firstDimToCollapse, sourceShape.end())); + + // Compute the collapsed offset. 
+ ArrayRef indicesToCollapse(indices.begin() + firstDimToCollapse, + indices.end()); + auto &&[collapsedExpr, collapsedVals] = computeLinearIndex( + collapsedOffset, collapsedStrides, indicesToCollapse); + collapsedOffset = affine::makeComposedFoldedAffineApply( + rewriter, loc, collapsedExpr, collapsedVals); + + if (collapsedOffset.is()) { + collapsedIndices.push_back(collapsedOffset.get()); } else { collapsedIndices.push_back(rewriter.create( - loc, *getConstantIntValue(offset))); + loc, *getConstantIntValue(collapsedOffset))); } } @@ -710,6 +714,7 @@ class FlattenContiguousRowMajorTransferWritePattern firstContiguousInnerDim, collapsedIndices))) return failure(); + Value collapsedSource = collapseInnerDims(rewriter, loc, source, firstContiguousInnerDim); MemRefType collapsedSourceType = diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 1775b5fa4a346..3b6441d0c9560 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -83,7 +83,7 @@ func.func @transfer_read_dims_mismatch_non_zero_indices( return } -// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 * 43)> +// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)> // CHECK-LABEL: func.func @transfer_read_dims_mismatch_non_zero_indices( // CHECK-SAME: %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index, @@ -92,7 +92,7 @@ func.func @transfer_read_dims_mismatch_non_zero_indices( // CHECK: %[[C_0:.*]] = arith.constant 0 : i32 // CHECK: %[[C_0_IDX:.*]] = arith.constant 0 : index // CHECK: %[[COLLAPSED_IN:.*]] = memref.collapse_shape %[[M_IN]] {{\[}}[0], [1, 2, 3]] : memref<1x43x4x6xi32> into memref<1x1032xi32> -// CHECK: %[[COLLAPSED_IDX:.*]] = affine.apply #[[$ATTR_0]]()[%[[IDX_2]], %[[IDX_1]]] +// CHECK: %[[COLLAPSED_IDX:.*]] = affine.apply #[[$ATTR_0]]()[%[[IDX_1]], %[[IDX_2]]] // CHECK: %[[READ:.*]] = vector.transfer_read 
%[[COLLAPSED_IN]][%[[C_0_IDX]], %[[COLLAPSED_IDX]]], %[[C_0]] {in_bounds = [true]} : memref<1x1032xi32>, vector<12xi32> // CHECK: %[[COLLAPSED_OUT:.*]] = memref.collapse_shape %[[M_OUT]] {{\[}}[0, 1, 2]] : memref<1x2x6xi32> into memref<12xi32> // CHECK: vector.transfer_write %[[READ]], %[[COLLAPSED_OUT]][%[[C_0_IDX]]] {in_bounds = [true]} : vector<12xi32>, memref<12xi32> @@ -459,3 +459,31 @@ func.func @fold_unit_dims_entirely(%arg0 : vector<8xi32>, // CHECK-128B-LABEL: func @fold_unit_dims_entirely( // CHECK-128B-NOT: memref.collapse_shape + +// ----- + +func.func @regression_non_contiguous_dim_read(%subview : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, + %idx0 : index, %idx1 : index) -> vector<2x2xf32> { + %c0 = arith.constant 0 : index + %cst_1 = arith.constant 0.000000e+00 : f32 + %8 = vector.transfer_read %subview[%c0, %idx0, %idx1, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, vector<2x2xf32> + return %8 : vector<2x2xf32> +} + +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)> +// CHECK-LABEL: func.func @regression_non_contiguous_dim_read( +// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>> +// CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() + +// ----- + +func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, + %subview : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, + %idx0 : index, %idx1 : index) { + %c0 = arith.constant 0 : index + vector.transfer_write %value, %subview[%c0, %idx0, %idx1, %c0] {in_bounds = [true, true]} : vector<2x2xf32>, memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> + return +} + +// CHECK-LABEL: func.func @unsupported_non_contiguous_dim_write( +// CHECK-NOT: memref.collapse_shape From 91e9e3175268c85f4d0e8828d0d392191c250543 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 22 
Feb 2024 13:47:36 -0700 Subject: [PATCH 270/351] [NewPM/CodeGen] Rewrite pass manager nesting (#81068) Currently the new PM infra for codegen puts everything into a MachineFunctionPassManager. The MachineFunctionPassManager owns both Module passes and MachineFunction passes, and batches adjacent MachineFunction passes like a typical PassManager. The current MachineFunctionAnalysisManager also directly references a module and function analysis manager to get results. The initial argument was that the codegen pipeline is relatively "flat", meaning it's mostly machine function passes with a couple of module passes here and there. However, there are a couple of issues with this as compared to a more structured nesting more like the optimization pipeline. For example, it doesn't allow running function passes then machine function passes on a function and its machine function all at once. It also currently requires the caller to split out the IR passes into one pass manager and the MIR passes into another pass manager. This patch rewrites the new pass manager infra for the codegen pipeline to be more similar to the nesting in the optimization pipeline. Basically, a Function contains a MachineFunction. So we can have Module -> Function -> MachineFunction adaptors. It also rewrites the analysis managers to have inner/outer proxies like the ones in the optimization pipeline. The new pass managers/adaptors/analysis managers can be seen in use in PassManagerTest.cpp. This allows us to consolidate to just having to add to one ModulePassManager when using the codegen pipeline. I haven't added the Function -> MachineFunction adaptor in this patch, but it should be added when we merge AddIRPass/AddMachinePass so that we can run IR and MIR passes on a function before proceeding to the next function. The MachineFunctionProperties infra for MIR verification is still WIP. 
--- .../include/llvm/CodeGen/MachinePassManager.h | 405 ++++++++++-------- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 108 +++-- llvm/include/llvm/Passes/PassBuilder.h | 15 +- llvm/include/llvm/Target/TargetMachine.h | 10 +- llvm/lib/CodeGen/MachinePassManager.cpp | 183 ++++---- llvm/lib/Passes/PassBuilder.cpp | 48 ++- llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 9 +- llvm/lib/Target/X86/X86TargetMachine.h | 7 +- llvm/test/tools/llc/new-pm/pipeline.mir | 3 +- llvm/test/tools/llc/new-pm/start-stop.ll | 7 +- llvm/tools/llc/NewPMDriver.cpp | 85 +--- llvm/unittests/CodeGen/PassManagerTest.cpp | 213 +++------ .../MIR/PassBuilderCallbacksTest.cpp | 216 ++++++---- 13 files changed, 681 insertions(+), 628 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachinePassManager.h b/llvm/include/llvm/CodeGen/MachinePassManager.h index a0ad7d7a95a28..7713c55661ccc 100644 --- a/llvm/include/llvm/CodeGen/MachinePassManager.h +++ b/llvm/include/llvm/CodeGen/MachinePassManager.h @@ -25,17 +25,18 @@ #include "llvm/ADT/FunctionExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/PassManagerInternal.h" #include "llvm/Support/Error.h" -#include - namespace llvm { class Module; class Function; class MachineFunction; extern template class AnalysisManager; +using MachineFunctionAnalysisManager = AnalysisManager; /// A CRTP mix-in that provides informational APIs needed for machine passes. /// @@ -46,217 +47,247 @@ struct MachinePassInfoMixin : public PassInfoMixin { // TODO: Add MachineFunctionProperties support. }; -/// An AnalysisManager that also exposes IR analysis results. 
-class MachineFunctionAnalysisManager : public AnalysisManager { -public: - using Base = AnalysisManager; +namespace detail { +struct MachinePassConcept + : PassConcept { + virtual MachineFunctionProperties getRequiredProperties() const = 0; + virtual MachineFunctionProperties getSetProperties() const = 0; + virtual MachineFunctionProperties getClearedProperties() const = 0; +}; - MachineFunctionAnalysisManager() : FAM(nullptr), MAM(nullptr) {} - MachineFunctionAnalysisManager(FunctionAnalysisManager &FAM, - ModuleAnalysisManager &MAM) - : FAM(&FAM), MAM(&MAM) {} - MachineFunctionAnalysisManager(MachineFunctionAnalysisManager &&) = default; - MachineFunctionAnalysisManager & - operator=(MachineFunctionAnalysisManager &&) = default; +template struct MachinePassModel : MachinePassConcept { + explicit MachinePassModel(PassT Pass) : Pass(std::move(Pass)) {} + // We have to explicitly define all the special member functions because MSVC + // refuses to generate them. + MachinePassModel(const MachinePassModel &Arg) : Pass(Arg.Pass) {} + MachinePassModel(MachinePassModel &&Arg) : Pass(std::move(Arg.Pass)) {} - /// Get the result of an analysis pass for a Function. - /// - /// Runs the analysis if a cached result is not available. - template typename PassT::Result &getResult(Function &F) { - return FAM->getResult(F); + friend void swap(MachinePassModel &LHS, MachinePassModel &RHS) { + using std::swap; + swap(LHS.Pass, RHS.Pass); } - /// Get the cached result of an analysis pass for a Function. - /// - /// This method never runs the analysis. - /// - /// \returns null if there is no cached result. - template - typename PassT::Result *getCachedResult(Function &F) { - return FAM->getCachedResult(F); + MachinePassModel &operator=(MachinePassModel RHS) { + swap(*this, RHS); + return *this; } - /// Get the result of an analysis pass for a Module. - /// - /// Runs the analysis if a cached result is not available. 
- template typename PassT::Result &getResult(Module &M) { - return MAM->getResult(M); + PreservedAnalyses run(MachineFunction &IR, + MachineFunctionAnalysisManager &AM) override { + return Pass.run(IR, AM); } - /// Get the cached result of an analysis pass for a Module. - /// - /// This method never runs the analysis. - /// - /// \returns null if there is no cached result. - template typename PassT::Result *getCachedResult(Module &M) { - return MAM->getCachedResult(M); + void printPipeline( + raw_ostream &OS, + function_ref MapClassName2PassName) override { + Pass.printPipeline(OS, MapClassName2PassName); } - /// Get the result of an analysis pass for a MachineFunction. - /// - /// Runs the analysis if a cached result is not available. - using Base::getResult; + StringRef name() const override { return PassT::name(); } - /// Get the cached result of an analysis pass for a MachineFunction. - /// - /// This method never runs the analysis. - /// - /// returns null if there is no cached result. - using Base::getCachedResult; - - // FIXME: Add LoopAnalysisManager or CGSCCAnalysisManager if needed. 
- FunctionAnalysisManager *FAM; - ModuleAnalysisManager *MAM; -}; + template + using has_required_t = decltype(std::declval().isRequired()); + template + static std::enable_if_t::value, bool> + passIsRequiredImpl() { + return T::isRequired(); + } + template + static std::enable_if_t::value, bool> + passIsRequiredImpl() { + return false; + } + bool isRequired() const override { return passIsRequiredImpl(); } + + template + using has_get_required_properties_t = + decltype(std::declval().getRequiredProperties()); + template + static std::enable_if_t::value, + MachineFunctionProperties> + getRequiredPropertiesImpl() { + return PassT::getRequiredProperties(); + } + template + static std::enable_if_t::value, + MachineFunctionProperties> + getRequiredPropertiesImpl() { + return MachineFunctionProperties(); + } + MachineFunctionProperties getRequiredProperties() const override { + return getRequiredPropertiesImpl(); + } -extern template class PassManager; + template + using has_get_set_properties_t = + decltype(std::declval().getSetProperties()); + template + static std::enable_if_t::value, + MachineFunctionProperties> + getSetPropertiesImpl() { + return PassT::getSetProperties(); + } + template + static std::enable_if_t::value, + MachineFunctionProperties> + getSetPropertiesImpl() { + return MachineFunctionProperties(); + } + MachineFunctionProperties getSetProperties() const override { + return getSetPropertiesImpl(); + } -/// MachineFunctionPassManager adds/removes below features to/from the base -/// PassManager template instantiation. -/// -/// - Support passes that implement doInitialization/doFinalization. This is for -/// machine function passes to work on module level constructs. One such pass -/// is AsmPrinter. -/// -/// - Support machine module pass which runs over the module (for example, -/// MachineOutliner). 
A machine module pass needs to define the method: -/// -/// ```Error run(Module &, MachineFunctionAnalysisManager &)``` -/// -/// FIXME: machine module passes still need to define the usual machine -/// function pass interface, namely, -/// `PreservedAnalyses run(MachineFunction &, -/// MachineFunctionAnalysisManager &)` -/// But this interface wouldn't be executed. It is just a placeholder -/// to satisfy the pass manager type-erased inteface. This -/// special-casing of machine module pass is due to its limited use -/// cases and the unnecessary complexity it may bring to the machine -/// pass manager. -/// -/// - The base class `run` method is replaced by an alternative `run` method. -/// See details below. -/// -/// - Support codegening in the SCC order. Users include interprocedural -/// register allocation (IPRA). -class MachineFunctionPassManager - : public PassManager { - using Base = PassManager; + template + using has_get_cleared_properties_t = + decltype(std::declval().getClearedProperties()); + template + static std::enable_if_t::value, + MachineFunctionProperties> + getClearedPropertiesImpl() { + return PassT::getClearedProperties(); + } + template + static std::enable_if_t::value, + MachineFunctionProperties> + getClearedPropertiesImpl() { + return MachineFunctionProperties(); + } + MachineFunctionProperties getClearedProperties() const override { + return getClearedPropertiesImpl(); + } + PassT Pass; +}; +} // namespace detail + +using MachineFunctionAnalysisManagerModuleProxy = + InnerAnalysisManagerProxy; + +template <> +bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate( + Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv); +extern template class InnerAnalysisManagerProxy; + +extern template class OuterAnalysisManagerProxy; +/// Provide the \c ModuleAnalysisManager to \c Function proxy. 
+using ModuleAnalysisManagerMachineFunctionProxy = + OuterAnalysisManagerProxy; + +class FunctionAnalysisManagerMachineFunctionProxy + : public AnalysisInfoMixin { public: - MachineFunctionPassManager(bool RequireCodeGenSCCOrder = false, - bool VerifyMachineFunction = false) - : RequireCodeGenSCCOrder(RequireCodeGenSCCOrder), - VerifyMachineFunction(VerifyMachineFunction) {} - MachineFunctionPassManager(MachineFunctionPassManager &&) = default; - MachineFunctionPassManager & - operator=(MachineFunctionPassManager &&) = default; - - /// Run machine passes for a Module. + class Result { + public: + explicit Result(FunctionAnalysisManager &FAM) : FAM(&FAM) {} + + Result(Result &&Arg) : FAM(std::move(Arg.FAM)) { + // We have to null out the analysis manager in the moved-from state + // because we are taking ownership of the responsibilty to clear the + // analysis state. + Arg.FAM = nullptr; + } + + ~Result() { + // FAM is cleared in a moved from state where there is nothing to do. + if (!FAM) + return; + + // Clear out the analysis manager if we're being destroyed -- it means we + // didn't even see an invalidate call when we got invalidated. + FAM->clear(); + } + + Result &operator=(Result &&RHS) { + FAM = RHS.FAM; + // We have to null out the analysis manager in the moved-from state + // because we are taking ownership of the responsibilty to clear the + // analysis state. + RHS.FAM = nullptr; + return *this; + } + + /// Accessor for the analysis manager. + FunctionAnalysisManager &getManager() { return *FAM; } + + /// Handler for invalidation of the outer IR unit, \c IRUnitT. + /// + /// If the proxy analysis itself is not preserved, we assume that the set of + /// inner IR objects contained in IRUnit may have changed. In this case, + /// we have to call \c clear() on the inner analysis manager, as it may now + /// have stale pointers to its inner IR objects. 
+ /// + /// Regardless of whether the proxy analysis is marked as preserved, all of + /// the analyses in the inner analysis manager are potentially invalidated + /// based on the set of preserved analyses. + bool invalidate(MachineFunction &IR, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv); + + private: + FunctionAnalysisManager *FAM; + }; + + explicit FunctionAnalysisManagerMachineFunctionProxy( + FunctionAnalysisManager &FAM) + : FAM(&FAM) {} + + /// Run the analysis pass and create our proxy result object. /// - /// The intended use is to start the codegen pipeline for a Module. The base - /// class's `run` method is deliberately hidden by this due to the observation - /// that we don't yet have the use cases of compositing two instances of - /// machine pass managers, or compositing machine pass managers with other - /// types of pass managers. - Error run(Module &M, MachineFunctionAnalysisManager &MFAM); - - template void addPass(PassT &&Pass) { - Base::addPass(std::forward(Pass)); - PassConceptT *P = Passes.back().get(); - addDoInitialization(P); - addDoFinalization(P); - - // Add machine module pass. - addRunOnModule(P); + /// This doesn't do any interesting work; it is primarily used to insert our + /// proxy result object into the outer analysis cache so that we can proxy + /// invalidation to the inner analysis manager. 
+ Result run(MachineFunction &, MachineFunctionAnalysisManager &) { + return Result(*FAM); } -private: - template - using has_init_t = decltype(std::declval().doInitialization( - std::declval(), - std::declval())); - - template - std::enable_if_t::value> - addDoInitialization(PassConceptT *Pass) {} - - template - std::enable_if_t::value> - addDoInitialization(PassConceptT *Pass) { - using PassModelT = detail::PassModel; - auto *P = static_cast(Pass); - InitializationFuncs.emplace_back( - [=](Module &M, MachineFunctionAnalysisManager &MFAM) { - return P->Pass.doInitialization(M, MFAM); - }); - } + static AnalysisKey Key; - template - using has_fini_t = decltype(std::declval().doFinalization( - std::declval(), - std::declval())); - - template - std::enable_if_t::value> - addDoFinalization(PassConceptT *Pass) {} - - template - std::enable_if_t::value> - addDoFinalization(PassConceptT *Pass) { - using PassModelT = detail::PassModel; - auto *P = static_cast(Pass); - FinalizationFuncs.emplace_back( - [=](Module &M, MachineFunctionAnalysisManager &MFAM) { - return P->Pass.doFinalization(M, MFAM); - }); - } +private: + FunctionAnalysisManager *FAM; +}; - template - using is_machine_module_pass_t = decltype(std::declval().run( - std::declval(), - std::declval())); - - template - using is_machine_function_pass_t = decltype(std::declval().run( - std::declval(), - std::declval())); - - template - std::enable_if_t::value> - addRunOnModule(PassConceptT *Pass) {} - - template - std::enable_if_t::value> - addRunOnModule(PassConceptT *Pass) { - static_assert(is_detected::value, - "machine module pass needs to define machine function pass " - "api. 
sorry."); - - using PassModelT = detail::PassModel; - auto *P = static_cast(Pass); - MachineModulePasses.emplace( - Passes.size() - 1, - [=](Module &M, MachineFunctionAnalysisManager &MFAM) { - return P->Pass.run(M, MFAM); - }); - } +class ModuleToMachineFunctionPassAdaptor + : public PassInfoMixin { + using MachinePassConcept = detail::MachinePassConcept; - using FuncTy = Error(Module &, MachineFunctionAnalysisManager &); - SmallVector, 4> InitializationFuncs; - SmallVector, 4> FinalizationFuncs; +public: + explicit ModuleToMachineFunctionPassAdaptor( + std::unique_ptr Pass) + : Pass(std::move(Pass)) {} - using PassIndex = decltype(Passes)::size_type; - std::map> MachineModulePasses; + /// Runs the function pass across every function in the module. + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + void printPipeline(raw_ostream &OS, + function_ref MapClassName2PassName); - // Run codegen in the SCC order. - bool RequireCodeGenSCCOrder; + static bool isRequired() { return true; } - bool VerifyMachineFunction; +private: + std::unique_ptr Pass; }; +template +ModuleToMachineFunctionPassAdaptor +createModuleToMachineFunctionPassAdaptor(MachineFunctionPassT &&Pass) { + using PassModelT = detail::MachinePassModel; + // Do not use make_unique, it causes too many template instantiations, + // causing terrible compile times. + return ModuleToMachineFunctionPassAdaptor( + std::unique_ptr( + new PassModelT(std::forward(Pass)))); +} + +template <> +PreservedAnalyses +PassManager::run(MachineFunction &, + AnalysisManager &); +extern template class PassManager; + +/// Convenience typedef for a pass manager over functions. 
+using MachineFunctionPassManager = PassManager; + } // end namespace llvm #endif // LLVM_CODEGEN_MACHINEPASSMANAGER_H diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 80bbfb75185a9..dc60727729f73 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -37,6 +37,7 @@ #include "llvm/CodeGen/InterleavedLoadCombine.h" #include "llvm/CodeGen/JMCInstrumenter.h" #include "llvm/CodeGen/LowerEmuTLS.h" +#include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/ReplaceWithVeclib.h" @@ -88,12 +89,8 @@ namespace llvm { #define DUMMY_MACHINE_MODULE_PASS(NAME, PASS_NAME) \ struct PASS_NAME : public MachinePassInfoMixin { \ template PASS_NAME(Ts &&...) {} \ - Error run(Module &, MachineFunctionAnalysisManager &) { \ - return Error::success(); \ - } \ - PreservedAnalyses run(MachineFunction &, \ - MachineFunctionAnalysisManager &) { \ - llvm_unreachable("this api is to make new PM api happy"); \ + PreservedAnalyses run(Module &, ModuleAnalysisManager &) { \ + return PreservedAnalyses::all(); \ } \ }; #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME) \ @@ -132,8 +129,8 @@ template class CodeGenPassBuilder { Opt.OptimizeRegAlloc = getOptLevel() != CodeGenOptLevel::None; } - Error buildPipeline(ModulePassManager &MPM, MachineFunctionPassManager &MFPM, - raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + Error buildPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out, + raw_pwrite_stream *DwoOut, CodeGenFileType FileType) const; PassInstrumentationCallbacks *getPassInstrumentationCallbacks() const { @@ -149,7 +146,15 @@ template class CodeGenPassBuilder { using is_function_pass_t = decltype(std::declval().run( std::declval(), std::declval())); + template + using is_machine_function_pass_t = decltype(std::declval().run( + std::declval(), + std::declval())); + // 
Function object to maintain state while adding codegen IR passes. + // TODO: add a Function -> MachineFunction adaptor and merge + // AddIRPass/AddMachinePass so we can have a function pipeline that runs both + // function passes and machine function passes. class AddIRPass { public: AddIRPass(ModulePassManager &MPM, const DerivedT &PB) : MPM(MPM), PB(PB) {} @@ -196,31 +201,47 @@ template class CodeGenPassBuilder { // Function object to maintain state while adding codegen machine passes. class AddMachinePass { public: - AddMachinePass(MachineFunctionPassManager &PM, const DerivedT &PB) - : PM(PM), PB(PB) {} + AddMachinePass(ModulePassManager &MPM, const DerivedT &PB) + : MPM(MPM), PB(PB) {} + ~AddMachinePass() { + if (!MFPM.isEmpty()) + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + } + + template + void operator()(PassT &&Pass, bool Force = false, + StringRef Name = PassT::name()) { + static_assert((is_detected::value || + is_detected::value) && + "Only module pass and function pass are supported."); - template void operator()(PassT &&Pass) { - if (!PB.runBeforeAdding(PassT::name())) + if (!Force && !PB.runBeforeAdding(Name)) return; - PM.addPass(std::forward(Pass)); + // Add Function Pass + if constexpr (is_detected::value) { + MFPM.addPass(std::forward(Pass)); - for (auto &C : PB.AfterCallbacks) - C(PassT::name()); - } + for (auto &C : PB.AfterCallbacks) + C(Name); + } else { + // Add Module Pass + if (!MFPM.isEmpty()) { + MPM.addPass( + createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + MFPM = MachineFunctionPassManager(); + } - template void insertPass(StringRef PassName, PassT Pass) { - PB.AfterCallbacks.emplace_back( - [this, PassName, Pass = std::move(Pass)](StringRef Name) { - if (PassName == Name) - this->PM.addPass(std::move(Pass)); - }); - } + MPM.addPass(std::forward(Pass)); - MachineFunctionPassManager releasePM() { return std::move(PM); } + for (auto &C : PB.AfterCallbacks) + C(Name); + } + } private: - 
MachineFunctionPassManager &PM; + ModulePassManager &MPM; + MachineFunctionPassManager MFPM; const DerivedT &PB; }; @@ -467,30 +488,43 @@ template class CodeGenPassBuilder { template Error CodeGenPassBuilder::buildPipeline( - ModulePassManager &MPM, MachineFunctionPassManager &MFPM, - raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, CodeGenFileType FileType) const { auto StartStopInfo = TargetPassConfig::getStartStopInfo(*PIC); if (!StartStopInfo) return StartStopInfo.takeError(); setStartStopPasses(*StartStopInfo); - AddIRPass addIRPass(MPM, derived()); - // `ProfileSummaryInfo` is always valid. - addIRPass(RequireAnalysisPass()); - addIRPass(RequireAnalysisPass()); - addISelPasses(addIRPass); - AddMachinePass addPass(MFPM, derived()); + bool PrintAsm = TargetPassConfig::willCompleteCodeGenPipeline(); + bool PrintMIR = !PrintAsm && FileType != CodeGenFileType::Null; + + { + AddIRPass addIRPass(MPM, derived()); + addIRPass(RequireAnalysisPass()); + addIRPass(RequireAnalysisPass()); + addISelPasses(addIRPass); + } + + AddMachinePass addPass(MPM, derived()); + + if (PrintMIR) + addPass(PrintMIRPreparePass(Out), /*Force=*/true); + if (auto Err = addCoreISelPasses(addPass)) return std::move(Err); if (auto Err = derived().addMachinePasses(addPass)) return std::move(Err); - derived().addAsmPrinter( - addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) { - return this->TM.createMCStreamer(Out, DwoOut, FileType, Ctx); - }); + if (PrintAsm) { + derived().addAsmPrinter( + addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) { + return this->TM.createMCStreamer(Out, DwoOut, FileType, Ctx); + }); + } + + if (PrintMIR) + addPass(PrintMIRPass(Out), /*Force=*/true); addPass(FreeMachineFunctionPass()); return verifyStartStop(*StartStopInfo); diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 10c5b7c00bae3..6822cfdb4957b 100644 --- 
a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -133,7 +133,8 @@ class PassBuilder { void crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM, CGSCCAnalysisManager &CGAM, - ModuleAnalysisManager &MAM); + ModuleAnalysisManager &MAM, + MachineFunctionAnalysisManager *MFAM = nullptr); /// Registers all available module analysis passes. /// @@ -569,9 +570,9 @@ class PassBuilder { ModulePipelineParsingCallbacks.push_back(C); } void registerPipelineParsingCallback( - const std::function - &C) { - MachinePipelineParsingCallbacks.push_back(C); + const std::function)> &C) { + MachineFunctionPipelineParsingCallbacks.push_back(C); } /// @}} @@ -733,8 +734,10 @@ class PassBuilder { // Machine pass callbackcs SmallVector, 2> MachineFunctionAnalysisRegistrationCallbacks; - SmallVector, 2> - MachinePipelineParsingCallbacks; + SmallVector)>, + 2> + MachineFunctionPipelineParsingCallbacks; }; /// This utility template takes care of adding require<> and invalidate<> diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 7462f61d32b56..d7ce088cad49f 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -34,8 +34,6 @@ using ModulePassManager = PassManager; class Function; class GlobalValue; -class MachineFunctionPassManager; -class MachineFunctionAnalysisManager; class MachineModuleInfoWrapperPass; class Mangler; class MCAsmInfo; @@ -455,11 +453,9 @@ class LLVMTargetMachine : public TargetMachine { bool DisableVerify = true, MachineModuleInfoWrapperPass *MMIWP = nullptr) override; - virtual Error buildCodeGenPipeline(ModulePassManager &, - MachineFunctionPassManager &, - MachineFunctionAnalysisManager &, - raw_pwrite_stream &, raw_pwrite_stream *, - CodeGenFileType, CGPassBuilderOption, + virtual Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &, + raw_pwrite_stream *, CodeGenFileType, + 
CGPassBuilderOption, PassInstrumentationCallbacks *) { return make_error("buildCodeGenPipeline is not overridden", inconvertibleErrorCode()); diff --git a/llvm/lib/CodeGen/MachinePassManager.cpp b/llvm/lib/CodeGen/MachinePassManager.cpp index d42bbe239830f..9a750b5bed433 100644 --- a/llvm/lib/CodeGen/MachinePassManager.cpp +++ b/llvm/lib/CodeGen/MachinePassManager.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachinePassManager.h" -#include "llvm/CodeGen/FreeMachineFunction.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/PassManagerImpl.h" @@ -19,99 +18,121 @@ using namespace llvm; namespace llvm { -template class AllAnalysesOn; + +AnalysisKey FunctionAnalysisManagerMachineFunctionProxy::Key; + template class AnalysisManager; template class PassManager; +template class InnerAnalysisManagerProxy; +template class OuterAnalysisManagerProxy; + +bool FunctionAnalysisManagerMachineFunctionProxy::Result::invalidate( + MachineFunction &IR, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv) { + // MachineFunction passes should not invalidate Function analyses. + // TODO: verify that PA doesn't invalidate Function analyses. + return false; +} -Error MachineFunctionPassManager::run(Module &M, - MachineFunctionAnalysisManager &MFAM) { - // MachineModuleAnalysis is a module analysis pass that is never invalidated - // because we don't run any module pass in codegen pipeline. This is very - // important because the codegen state is stored in MMI which is the analysis - // result of MachineModuleAnalysis. MMI should not be recomputed. - auto &MMI = MFAM.getResult(M).getMMI(); - - (void)RequireCodeGenSCCOrder; - assert(!RequireCodeGenSCCOrder && "not implemented"); - - // M is unused here - PassInstrumentation PI = MFAM.getResult(M); - - // Add a PIC to verify machine functions. 
- if (VerifyMachineFunction) { - // No need to pop this callback later since MIR pipeline is flat which means - // current pipeline is the top-level pipeline. Callbacks are not used after - // current pipeline. - PI.pushBeforeNonSkippedPassCallback([](StringRef PassID, Any IR) { - assert(llvm::any_cast(&IR)); - const MachineFunction *MF = llvm::any_cast(IR); - assert(MF && "Machine function should be valid for printing"); - std::string Banner = std::string("After ") + std::string(PassID); - verifyMachineFunction(Banner, *MF); - }); +template <> +bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate( + Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv) { + // If literally everything is preserved, we're done. + if (PA.areAllPreserved()) + return false; // This is still a valid proxy. + + // If this proxy isn't marked as preserved, then even if the result remains + // valid, the key itself may no longer be valid, so we clear everything. + // + // Note that in order to preserve this proxy, a module pass must ensure that + // the MFAM has been completely updated to handle the deletion of functions. + // Specifically, any MFAM-cached results for those functions need to have been + // forcibly cleared. When preserved, this proxy will only invalidate results + // cached on functions *still in the module* at the end of the module pass. + auto PAC = PA.getChecker(); + if (!PAC.preserved() && !PAC.preservedSet>()) { + InnerAM->clear(); + return true; } - for (auto &F : InitializationFuncs) { - if (auto Err = F(M, MFAM)) - return Err; + // FIXME: be more precise, see + // FunctionAnalysisManagerModuleProxy::Result::invalidate. 
+ if (!PA.allAnalysesInSetPreserved>()) { + InnerAM->clear(); + return true; } - unsigned Idx = 0; - size_t Size = Passes.size(); - do { - // Run machine module passes - for (; MachineModulePasses.count(Idx) && Idx != Size; ++Idx) { - if (!PI.runBeforePass(*Passes[Idx], M)) - continue; - if (auto Err = MachineModulePasses.at(Idx)(M, MFAM)) - return Err; - PI.runAfterPass(*Passes[Idx], M, PreservedAnalyses::all()); - } - - // Finish running all passes. - if (Idx == Size) - break; - - // Run machine function passes - - // Get index range of machine function passes. - unsigned Begin = Idx; - for (; !MachineModulePasses.count(Idx) && Idx != Size; ++Idx) - ; - - for (Function &F : M) { - // Do not codegen any 'available_externally' functions at all, they have - // definitions outside the translation unit. - if (F.hasAvailableExternallyLinkage()) - continue; - - MachineFunction &MF = MMI.getOrCreateMachineFunction(F); - - for (unsigned I = Begin, E = Idx; I != E; ++I) { - auto *P = Passes[I].get(); + // Return false to indicate that this result is still a valid proxy. + return false; +} - if (!PI.runBeforePass(*P, MF)) - continue; +PreservedAnalyses +ModuleToMachineFunctionPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { + auto &MMI = AM.getResult(M).getMMI(); + MachineFunctionAnalysisManager &MFAM = + AM.getResult(M).getManager(); + PassInstrumentation PI = AM.getResult(M); + PreservedAnalyses PA = PreservedAnalyses::all(); + for (Function &F : M) { + // Do not codegen any 'available_externally' functions at all, they have + // definitions outside the translation unit. 
+ if (F.hasAvailableExternallyLinkage()) + continue; + + MachineFunction &MF = MMI.getOrCreateMachineFunction(F); + + if (!PI.runBeforePass(*Pass, MF)) + continue; + PreservedAnalyses PassPA = Pass->run(MF, MFAM); + if (MMI.getMachineFunction(F)) { + MFAM.invalidate(MF, PassPA); + PI.runAfterPass(*Pass, MF, PassPA); + } else { + MFAM.clear(MF, F.getName()); + PI.runAfterPassInvalidated(*Pass, PassPA); + } + PA.intersect(std::move(PassPA)); + } - // TODO: EmitSizeRemarks - PreservedAnalyses PassPA = P->run(MF, MFAM); + return PA; +} - // MF is dangling after FreeMachineFunctionPass - if (P->name() != FreeMachineFunctionPass::name()) { - MFAM.invalidate(MF, PassPA); +void ModuleToMachineFunctionPassAdaptor::printPipeline( + raw_ostream &OS, function_ref MapClassName2PassName) { + OS << "machine-function("; + Pass->printPipeline(OS, MapClassName2PassName); + OS << ')'; +} - PI.runAfterPass(*P, MF, PassPA); - } - } +template <> +PreservedAnalyses +PassManager::run(MachineFunction &MF, + AnalysisManager &MFAM) { + PassInstrumentation PI = MFAM.getResult(MF); + Function &F = MF.getFunction(); + MachineModuleInfo &MMI = + MFAM.getResult(MF) + .getCachedResult(*F.getParent()) + ->getMMI(); + PreservedAnalyses PA = PreservedAnalyses::all(); + for (auto &Pass : Passes) { + if (!PI.runBeforePass(*Pass, MF)) + continue; + + PreservedAnalyses PassPA = Pass->run(MF, MFAM); + if (MMI.getMachineFunction(F)) { + MFAM.invalidate(MF, PassPA); + PI.runAfterPass(*Pass, MF, PassPA); + } else { + MFAM.clear(MF, F.getName()); + PI.runAfterPassInvalidated(*Pass, PassPA); } - } while (true); - - for (auto &F : FinalizationFuncs) { - if (auto Err = F(M, MFAM)) - return Err; + PA.intersect(std::move(PassPA)); } - - return Error::success(); + return PA; } } // namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f26d95ab1e479..fed7a14c8a2e3 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -91,6 +91,7 @@ 
#include "llvm/CodeGen/JMCInstrumenter.h" #include "llvm/CodeGen/LowerEmuTLS.h" #include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/SafeStack.h" #include "llvm/CodeGen/SelectOptimize.h" #include "llvm/CodeGen/ShadowStackGCLowering.h" @@ -1259,6 +1260,28 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) { return callbacksAcceptPassName(Name, Callbacks); } +template +static bool isMachineFunctionPassName(StringRef Name, CallbacksT &Callbacks) { + // Explicitly handle pass manager names. + if (Name == "machine-function") + return true; + + // Explicitly handle custom-parsed pass names. + if (parseRepeatPassName(Name)) + return true; + +#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) \ + return true; +#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS) \ + if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \ + return true; + +#include "llvm/Passes/MachinePassRegistry.def" + + return callbacksAcceptPassName(Name, Callbacks); +} + template static bool isLoopNestPassName(StringRef Name, CallbacksT &Callbacks, bool &UseMemorySSA) { @@ -1394,6 +1417,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); return Error::success(); } + if (Name == "machine-function") { + MachineFunctionPassManager MFPM; + if (auto Err = parseMachinePassPipeline(MFPM, InnerPipeline)) + return Err; + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + return Error::success(); + } if (auto Params = parseFunctionPipelineName(Name)) { if (Params->second) return make_error( @@ -1874,8 +1904,8 @@ Error PassBuilder::parseMachinePass(MachineFunctionPassManager &MFPM, } #include "llvm/Passes/MachinePassRegistry.def" - for (auto &C : MachinePipelineParsingCallbacks) - if (C(Name, MFPM)) + for (auto &C : MachineFunctionPipelineParsingCallbacks) + if (C(Name, MFPM, E.InnerPipeline)) return 
Error::success(); return make_error( formatv("unknown machine pass '{0}'", Name).str(), @@ -1942,7 +1972,8 @@ Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM, void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM, CGSCCAnalysisManager &CGAM, - ModuleAnalysisManager &MAM) { + ModuleAnalysisManager &MAM, + MachineFunctionAnalysisManager *MFAM) { MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); }); MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); }); CGAM.registerPass([&] { return ModuleAnalysisManagerCGSCCProxy(MAM); }); @@ -1950,6 +1981,14 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); }); FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); }); LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); + if (MFAM) { + MAM.registerPass( + [&] { return MachineFunctionAnalysisManagerModuleProxy(*MFAM); }); + MFAM->registerPass( + [&] { return ModuleAnalysisManagerMachineFunctionProxy(MAM); }); + MFAM->registerPass( + [&] { return FunctionAnalysisManagerMachineFunctionProxy(FAM); }); + } } Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, @@ -1991,6 +2030,9 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM, UseMemorySSA)) { Pipeline = {{"function", {{UseMemorySSA ? 
"loop-mssa" : "loop", std::move(*Pipeline)}}}}; + } else if (isMachineFunctionPassName( + FirstName, MachineFunctionPipelineParsingCallbacks)) { + Pipeline = {{"machine-function", std::move(*Pipeline)}}; } else { for (auto &C : TopLevelPipelineParsingCallbacks) if (C(MPM, *Pipeline)) diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp index 4a11dd2e31acd..a620ba911ec61 100644 --- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -47,10 +47,9 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const { } // namespace Error X86TargetMachine::buildCodeGenPipeline( - ModulePassManager &MPM, MachineFunctionPassManager &MFPM, - MachineFunctionAnalysisManager &, raw_pwrite_stream &Out, - raw_pwrite_stream *DwoOut, CodeGenFileType FileType, - CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) { + ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + CodeGenFileType FileType, CGPassBuilderOption Opt, + PassInstrumentationCallbacks *PIC) { auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC); - return CGPB.buildPipeline(MPM, MFPM, Out, DwoOut, FileType); + return CGPB.buildPipeline(MPM, Out, DwoOut, FileType); } diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index f31c971df9584..0fd3e47aaefe7 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -58,10 +58,9 @@ class X86TargetMachine final : public LLVMTargetMachine { createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; - Error buildCodeGenPipeline(ModulePassManager &, MachineFunctionPassManager &, - MachineFunctionAnalysisManager &, - raw_pwrite_stream &, raw_pwrite_stream *, - CodeGenFileType, CGPassBuilderOption, + Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &, + raw_pwrite_stream *, 
CodeGenFileType, + CGPassBuilderOption, PassInstrumentationCallbacks *) override; bool isJIT() const { return IsJIT; } diff --git a/llvm/test/tools/llc/new-pm/pipeline.mir b/llvm/test/tools/llc/new-pm/pipeline.mir index c7dda4b6d1356..fcc7d4f8f02e3 100644 --- a/llvm/test/tools/llc/new-pm/pipeline.mir +++ b/llvm/test/tools/llc/new-pm/pipeline.mir @@ -1,7 +1,6 @@ # RUN: llc -mtriple=x86_64-pc-linux-gnu -x mir -passes=no-op-machine-function --print-pipeline-passes -filetype=null < %s | FileCheck %s --match-full-lines -# CHECK: IR pipeline: PrintMIRPreparePass -# CHECK: MIR pipeline: no-op-machine-function,print,FreeMachineFunctionPass +# CHECK: machine-function(no-op-machine-function),PrintMIRPreparePass,machine-function(print,FreeMachineFunctionPass) --- name: f diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll index c25e45d1f7ab9..8c795a7a70f81 100644 --- a/llvm/test/tools/llc/new-pm/start-stop.ll +++ b/llvm/test/tools/llc/new-pm/start-stop.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s - -; CHECK: IR pipeline: function(mergeicmps,expand-memcmp,gc-lowering) +; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s --check-prefix=NULL +; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -o /dev/null %s | FileCheck --match-full-lines %s --check-prefix=OBJ +; NULL: function(mergeicmps,expand-memcmp,gc-lowering) +; OBJ: function(mergeicmps,expand-memcmp,gc-lowering),PrintMIRPreparePass,machine-function(print) diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp index c3288ef9d0808..6ae1b8db5e115 100644 --- a/llvm/tools/llc/NewPMDriver.cpp +++ 
b/llvm/tools/llc/NewPMDriver.cpp @@ -89,30 +89,6 @@ bool LLCDiagnosticHandler::handleDiagnostics(const DiagnosticInfo &DI) { static llvm::ExitOnError ExitOnErr; -static void RunPasses(bool BOS, ToolOutputFile *Out, Module *M, - LLVMContext &Context, SmallString<0> &Buffer, - ModulePassManager *MPM, ModuleAnalysisManager *MAM, - MachineFunctionPassManager &MFPM, - MachineFunctionAnalysisManager &MFAM) { - assert(M && "invalid input module!"); - - // Before executing passes, print the final values of the LLVM options. - cl::PrintOptionValues(); - - if (MPM) { - assert(MAM && "expect a ModuleAnalysisManager!"); - MPM->run(*M, *MAM); - } - - ExitOnErr(MFPM.run(*M, MFAM)); - - if (Context.getDiagHandlerPtr()->HasErrors) - exit(1); - - if (BOS) - Out->os() << Buffer; -} - int llvm::compileModuleWithNewPM( StringRef Arg0, std::unique_ptr M, std::unique_ptr MIR, std::unique_ptr Target, std::unique_ptr Out, @@ -131,16 +107,6 @@ int llvm::compileModuleWithNewPM( raw_pwrite_stream *OS = &Out->os(); - // Manually do the buffering rather than using buffer_ostream, - // so we can memcmp the contents in CompileTwice mode in future. 
- SmallString<0> Buffer; - std::unique_ptr BOS; - if ((codegen::getFileType() != CodeGenFileType::AssemblyFile && - !Out->os().supportsSeeking())) { - BOS = std::make_unique(Buffer); - OS = BOS.get(); - } - // Fetch options from TargetPassConfig CGPassBuilderOption Opt = getCGPassBuilderOption(); Opt.DisableVerify = NoVerify; @@ -158,20 +124,19 @@ int llvm::compileModuleWithNewPM( FunctionAnalysisManager FAM; CGSCCAnalysisManager CGAM; ModuleAnalysisManager MAM; + MachineFunctionAnalysisManager MFAM; PassBuilder PB(Target.get(), PipelineTuningOptions(), std::nullopt, &PIC); PB.registerModuleAnalyses(MAM); PB.registerCGSCCAnalyses(CGAM); PB.registerFunctionAnalyses(FAM); PB.registerLoopAnalyses(LAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + PB.registerMachineFunctionAnalyses(MFAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); FAM.registerPass([&] { return TargetLibraryAnalysis(TLII); }); MAM.registerPass([&] { return MachineModuleAnalysis(MMI); }); - MachineFunctionAnalysisManager MFAM(FAM, MAM); - ModulePassManager MPM; - MachineFunctionPassManager MFPM; if (!PassPipeline.empty()) { // Construct a custom pass pipeline that starts after instruction @@ -182,49 +147,39 @@ int llvm::compileModuleWithNewPM( return 1; } - ExitOnErr(PB.parsePassPipeline(MFPM, PassPipeline)); + // FIXME: verify that there are no IR passes. + ExitOnErr(PB.parsePassPipeline(MPM, PassPipeline)); MPM.addPass(PrintMIRPreparePass(*OS)); + MachineFunctionPassManager MFPM; MFPM.addPass(PrintMIRPass(*OS)); MFPM.addPass(FreeMachineFunctionPass()); + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); - auto &MMI = MFAM.getResult(*M).getMMI(); if (MIR->parseMachineFunctions(*M, MMI)) return 1; } else { - ExitOnErr(LLVMTM.buildCodeGenPipeline(MPM, MFPM, MFAM, *OS, - DwoOut ? 
&DwoOut->os() : nullptr, - FileType, Opt, &PIC)); - - auto StartStopInfo = TargetPassConfig::getStartStopInfo(PIC); - assert(StartStopInfo && "Expect StartStopInfo!"); - - if (auto StopPassName = StartStopInfo->StopPass; !StopPassName.empty()) { - MFPM.addPass(PrintMIRPass(*OS)); - MFPM.addPass(FreeMachineFunctionPass()); - } + ExitOnErr(LLVMTM.buildCodeGenPipeline( + MPM, *OS, DwoOut ? &DwoOut->os() : nullptr, FileType, Opt, &PIC)); } if (PrintPipelinePasses) { - std::string IRPipeline; - raw_string_ostream IRSOS(IRPipeline); - MPM.printPipeline(IRSOS, [&PIC](StringRef ClassName) { - auto PassName = PIC.getPassNameForClassName(ClassName); - return PassName.empty() ? ClassName : PassName; - }); - outs() << "IR pipeline: " << IRPipeline << '\n'; - - std::string MIRPipeline; - raw_string_ostream MIRSOS(MIRPipeline); - MFPM.printPipeline(MIRSOS, [&PIC](StringRef ClassName) { + std::string PipelineStr; + raw_string_ostream OS(PipelineStr); + MPM.printPipeline(OS, [&PIC](StringRef ClassName) { auto PassName = PIC.getPassNameForClassName(ClassName); return PassName.empty() ? ClassName : PassName; }); - outs() << "MIR pipeline: " << MIRPipeline << '\n'; + outs() << PipelineStr << '\n'; return 0; } - RunPasses(BOS.get(), Out.get(), M.get(), Context, Buffer, &MPM, &MAM, MFPM, - MFAM); + // Before executing passes, print the final values of the LLVM options. + cl::PrintOptionValues(); + + MPM.run(*M, MAM); + + if (Context.getDiagHandlerPtr()->HasErrors) + exit(1); // Declare success. 
Out->keep(); diff --git a/llvm/unittests/CodeGen/PassManagerTest.cpp b/llvm/unittests/CodeGen/PassManagerTest.cpp index 28003c2f4b3f1..4283eb01a9c8f 100644 --- a/llvm/unittests/CodeGen/PassManagerTest.cpp +++ b/llvm/unittests/CodeGen/PassManagerTest.cpp @@ -5,13 +5,18 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// Test that the various MachineFunction pass managers, adaptors, analyses, and +// analysis managers work. +//===----------------------------------------------------------------------===// +#include "llvm/IR/PassManager.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/AsmParser/Parser.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/MC/TargetRegistry.h" @@ -34,14 +39,9 @@ class TestFunctionAnalysis : public AnalysisInfoMixin { int InstructionCount; }; - /// Run the analysis pass over the function and return a result. + /// The number of instructions in the Function. Result run(Function &F, FunctionAnalysisManager &AM) { - int Count = 0; - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) - for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE; - ++II) - ++Count; - return Result(Count); + return Result(F.getInstructionCount()); } private: @@ -59,13 +59,12 @@ class TestMachineFunctionAnalysis int InstructionCount; }; - /// Run the analysis pass over the machine function and return a result. - Result run(MachineFunction &MF, MachineFunctionAnalysisManager::Base &AM) { - auto &MFAM = static_cast(AM); - // Query function analysis result. 
+ Result run(MachineFunction &MF, MachineFunctionAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult(MF) + .getManager(); TestFunctionAnalysis::Result &FAR = - MFAM.getResult(MF.getFunction()); - // + 5 + FAM.getResult(MF.getFunction()); return FAR.InstructionCount; } @@ -76,90 +75,54 @@ class TestMachineFunctionAnalysis AnalysisKey TestMachineFunctionAnalysis::Key; -const std::string DoInitErrMsg = "doInitialization failed"; -const std::string DoFinalErrMsg = "doFinalization failed"; - struct TestMachineFunctionPass : public PassInfoMixin { - TestMachineFunctionPass(int &Count, std::vector &BeforeInitialization, - std::vector &BeforeFinalization, - std::vector &MachineFunctionPassCount) - : Count(Count), BeforeInitialization(BeforeInitialization), - BeforeFinalization(BeforeFinalization), - MachineFunctionPassCount(MachineFunctionPassCount) {} - - Error doInitialization(Module &M, MachineFunctionAnalysisManager &MFAM) { - // Force doInitialization fail by starting with big `Count`. - if (Count > 10000) - return make_error(DoInitErrMsg, inconvertibleErrorCode()); - - // + 1 - ++Count; - BeforeInitialization.push_back(Count); - return Error::success(); - } - Error doFinalization(Module &M, MachineFunctionAnalysisManager &MFAM) { - // Force doFinalization fail by starting with big `Count`. - if (Count > 1000) - return make_error(DoFinalErrMsg, inconvertibleErrorCode()); - - // + 1 - ++Count; - BeforeFinalization.push_back(Count); - return Error::success(); - } + TestMachineFunctionPass(int &Count, std::vector &Counts) + : Count(Count), Counts(Counts) {} PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - // Query function analysis result. + FunctionAnalysisManager &FAM = + MFAM.getResult(MF) + .getManager(); TestFunctionAnalysis::Result &FAR = - MFAM.getResult(MF.getFunction()); - // 3 + 1 + 1 = 5 + FAM.getResult(MF.getFunction()); Count += FAR.InstructionCount; - // Query module analysis result. 
- MachineModuleInfo &MMI = - MFAM.getResult(*MF.getFunction().getParent()) - .getMMI(); - // 1 + 1 + 1 = 3 - Count += (MMI.getModule() == MF.getFunction().getParent()); - - // Query machine function analysis result. TestMachineFunctionAnalysis::Result &MFAR = MFAM.getResult(MF); - // 3 + 1 + 1 = 5 Count += MFAR.InstructionCount; - MachineFunctionPassCount.push_back(Count); + Counts.push_back(Count); return PreservedAnalyses::none(); } int &Count; - std::vector &BeforeInitialization; - std::vector &BeforeFinalization; - std::vector &MachineFunctionPassCount; + std::vector &Counts; }; struct TestMachineModulePass : public PassInfoMixin { - TestMachineModulePass(int &Count, std::vector &MachineModulePassCount) - : Count(Count), MachineModulePassCount(MachineModulePassCount) {} - - Error run(Module &M, MachineFunctionAnalysisManager &MFAM) { - MachineModuleInfo &MMI = MFAM.getResult(M).getMMI(); - // + 1 - Count += (MMI.getModule() == &M); - MachineModulePassCount.push_back(Count); - return Error::success(); - } - - PreservedAnalyses run(MachineFunction &MF, - MachineFunctionAnalysisManager &AM) { - llvm_unreachable( - "This should never be reached because this is machine module pass"); + TestMachineModulePass(int &Count, std::vector &Counts) + : Count(Count), Counts(Counts) {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) { + MachineModuleInfo &MMI = MAM.getResult(M).getMMI(); + FunctionAnalysisManager &FAM = + MAM.getResult(M).getManager(); + MachineFunctionAnalysisManager &MFAM = + MAM.getResult(M) + .getManager(); + for (Function &F : M) { + MachineFunction &MF = MMI.getOrCreateMachineFunction(F); + Count += FAM.getResult(F).InstructionCount; + Count += MFAM.getResult(MF).InstructionCount; + } + Counts.push_back(Count); + return PreservedAnalyses::all(); } int &Count; - std::vector &MachineModulePassCount; + std::vector &Counts; }; std::unique_ptr parseIR(LLVMContext &Context, const char *IR) { @@ -211,102 +174,40 @@ TEST_F(PassManagerTest, 
Basic) { M->setDataLayout(TM->createDataLayout()); MachineModuleInfo MMI(LLVMTM); + LoopAnalysisManager LAM; FunctionAnalysisManager FAM; CGSCCAnalysisManager CGAM; ModuleAnalysisManager MAM; + MachineFunctionAnalysisManager MFAM; PassBuilder PB(TM.get()); PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); PB.registerFunctionAnalyses(FAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + PB.registerLoopAnalyses(LAM); + PB.registerMachineFunctionAnalyses(MFAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); FAM.registerPass([&] { return TestFunctionAnalysis(); }); - FAM.registerPass([&] { return PassInstrumentationAnalysis(); }); MAM.registerPass([&] { return MachineModuleAnalysis(MMI); }); - MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); - - MachineFunctionAnalysisManager MFAM; - { - // Test move assignment. - MachineFunctionAnalysisManager NestedMFAM(FAM, MAM); - NestedMFAM.registerPass([&] { return PassInstrumentationAnalysis(); }); - NestedMFAM.registerPass([&] { return TestMachineFunctionAnalysis(); }); - MFAM = std::move(NestedMFAM); - } + MFAM.registerPass([&] { return TestMachineFunctionAnalysis(); }); int Count = 0; - std::vector BeforeInitialization[2]; - std::vector BeforeFinalization[2]; - std::vector TestMachineFunctionCount[2]; - std::vector TestMachineModuleCount[2]; + std::vector Counts; + ModulePassManager MPM; MachineFunctionPassManager MFPM; - { - // Test move assignment. 
- MachineFunctionPassManager NestedMFPM; - NestedMFPM.addPass(TestMachineModulePass(Count, TestMachineModuleCount[0])); - NestedMFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[0], - BeforeFinalization[0], - TestMachineFunctionCount[0])); - NestedMFPM.addPass(TestMachineModulePass(Count, TestMachineModuleCount[1])); - NestedMFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1], - BeforeFinalization[1], - TestMachineFunctionCount[1])); - MFPM = std::move(NestedMFPM); - } + MPM.addPass(TestMachineModulePass(Count, Counts)); + MPM.addPass(createModuleToMachineFunctionPassAdaptor( + TestMachineFunctionPass(Count, Counts))); + MPM.addPass(TestMachineModulePass(Count, Counts)); + MFPM.addPass(TestMachineFunctionPass(Count, Counts)); + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + + MPM.run(*M, MAM); - ASSERT_FALSE(errorToBool(MFPM.run(*M, MFAM))); - - // Check first machine module pass - EXPECT_EQ(1u, TestMachineModuleCount[0].size()); - EXPECT_EQ(3, TestMachineModuleCount[0][0]); - - // Check first machine function pass - EXPECT_EQ(1u, BeforeInitialization[0].size()); - EXPECT_EQ(1, BeforeInitialization[0][0]); - EXPECT_EQ(3u, TestMachineFunctionCount[0].size()); - EXPECT_EQ(10, TestMachineFunctionCount[0][0]); - EXPECT_EQ(13, TestMachineFunctionCount[0][1]); - EXPECT_EQ(16, TestMachineFunctionCount[0][2]); - EXPECT_EQ(1u, BeforeFinalization[0].size()); - EXPECT_EQ(31, BeforeFinalization[0][0]); - - // Check second machine module pass - EXPECT_EQ(1u, TestMachineModuleCount[1].size()); - EXPECT_EQ(17, TestMachineModuleCount[1][0]); - - // Check second machine function pass - EXPECT_EQ(1u, BeforeInitialization[1].size()); - EXPECT_EQ(2, BeforeInitialization[1][0]); - EXPECT_EQ(3u, TestMachineFunctionCount[1].size()); - EXPECT_EQ(24, TestMachineFunctionCount[1][0]); - EXPECT_EQ(27, TestMachineFunctionCount[1][1]); - EXPECT_EQ(30, TestMachineFunctionCount[1][2]); - EXPECT_EQ(1u, BeforeFinalization[1].size()); - 
EXPECT_EQ(32, BeforeFinalization[1][0]); - - EXPECT_EQ(32, Count); - - // doInitialization returns error - Count = 10000; - MFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1], - BeforeFinalization[1], - TestMachineFunctionCount[1])); - std::string Message; - llvm::handleAllErrors(MFPM.run(*M, MFAM), [&](llvm::StringError &Error) { - Message = Error.getMessage(); - }); - EXPECT_EQ(Message, DoInitErrMsg); - - // doFinalization returns error - Count = 1000; - MFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1], - BeforeFinalization[1], - TestMachineFunctionCount[1])); - llvm::handleAllErrors(MFPM.run(*M, MFAM), [&](llvm::StringError &Error) { - Message = Error.getMessage(); - }); - EXPECT_EQ(Message, DoFinalErrMsg); + EXPECT_EQ((std::vector{10, 16, 18, 20, 30, 36, 38, 40}), Counts); + EXPECT_EQ(40, Count); } } // namespace diff --git a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp index 8ecde223cb510..4b7d7846b0336 100644 --- a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp +++ b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/CodeGen/FreeMachineFunction.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Testing/Support/Error.h" @@ -96,8 +99,6 @@ MATCHER_P(HasNameRegex, Name, "") { } struct MockPassInstrumentationCallbacks { - PassInstrumentationCallbacks Callbacks; - MockPassInstrumentationCallbacks() { ON_CALL(*this, runBeforePass(_, _)).WillByDefault(Return(true)); } @@ -111,7 +112,7 @@ struct MockPassInstrumentationCallbacks { MOCK_METHOD2(runBeforeAnalysis, void(StringRef PassID, llvm::Any)); MOCK_METHOD2(runAfterAnalysis, void(StringRef PassID, llvm::Any)); - void registerPassInstrumentation() { + void 
registerPassInstrumentation(PassInstrumentationCallbacks &Callbacks) { Callbacks.registerShouldRunOptionalPassCallback( [this](StringRef P, llvm::Any IR) { return this->runBeforePass(P, IR); @@ -147,7 +148,8 @@ struct MockPassInstrumentationCallbacks { // to check these explicitly. EXPECT_CALL(*this, runBeforePass(Not(HasNameRegex("Mock")), HasName(IRName))) - .Times(AnyNumber()); + .Times(AnyNumber()) + .WillRepeatedly(Return(false)); EXPECT_CALL( *this, runBeforeSkippedPass(Not(HasNameRegex("Mock")), HasName(IRName))) .Times(AnyNumber()); @@ -157,15 +159,9 @@ struct MockPassInstrumentationCallbacks { EXPECT_CALL(*this, runAfterPass(Not(HasNameRegex("Mock")), HasName(IRName), _)) .Times(AnyNumber()); - EXPECT_CALL(*this, runBeforeAnalysis(HasNameRegex("MachineModuleAnalysis"), - HasName(IRName))) - .Times(AnyNumber()); EXPECT_CALL(*this, runBeforeAnalysis(Not(HasNameRegex("Mock")), HasName(IRName))) .Times(AnyNumber()); - EXPECT_CALL(*this, runAfterAnalysis(HasNameRegex("MachineModuleAnalysis"), - HasName(IRName))) - .Times(AnyNumber()); EXPECT_CALL(*this, runAfterAnalysis(Not(HasNameRegex("Mock")), HasName(IRName))) .Times(AnyNumber()); @@ -202,7 +198,7 @@ template class MockAnalysisHandleBase { } }; - Result run(MachineFunction &IR, MachineFunctionAnalysisManager::Base &AM) { + Result run(MachineFunction &IR, MachineFunctionAnalysisManager &AM) { return Handle->run(IR, AM); } }; @@ -249,7 +245,7 @@ template class MockPassHandleBase { public: PreservedAnalyses run(MachineFunction &IR, - MachineFunctionAnalysisManager::Base &AM) { + MachineFunctionAnalysisManager &AM) { return Handle->run(IR, AM); } }; @@ -270,7 +266,7 @@ template class MockPassHandleBase { struct MockAnalysisHandle : public MockAnalysisHandleBase { MOCK_METHOD2(run, Analysis::Result(MachineFunction &, - MachineFunctionAnalysisManager::Base &)); + MachineFunctionAnalysisManager &)); MOCK_METHOD3(invalidate, bool(MachineFunction &, const PreservedAnalyses &, 
MachineFunctionAnalysisManager::Invalidator &)); @@ -284,7 +280,7 @@ AnalysisKey MockAnalysisHandleBase::Analysis::Key; class MockPassHandle : public MockPassHandleBase { public: MOCK_METHOD2(run, PreservedAnalyses(MachineFunction &, - MachineFunctionAnalysisManager::Base &)); + MachineFunctionAnalysisManager &)); MockPassHandle() { setDefaults(); } }; @@ -297,50 +293,51 @@ class MachineFunctionCallbacksTest : public testing::Test { InitializeAllTargetMCs(); } + LLVMContext Context; + std::unique_ptr TM; std::unique_ptr MMI; - LLVMContext Context; std::unique_ptr M; - std::unique_ptr MIR; - - MockPassInstrumentationCallbacks CallbacksHandle; - PassBuilder PB; - ModulePassManager PM; - MachineFunctionPassManager MFPM; - FunctionAnalysisManager FAM; - ModuleAnalysisManager AM; + PassInstrumentationCallbacks PIC; + std::unique_ptr PB; + ModulePassManager MPM; MachineFunctionAnalysisManager MFAM; + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + MockPassInstrumentationCallbacks CallbacksHandle; MockPassHandle PassHandle; MockAnalysisHandle AnalysisHandle; - std::unique_ptr parseMIR(const TargetMachine &TM, StringRef MIRCode, - MachineModuleInfo &MMI) { + static std::unique_ptr parseMIR(StringRef MIRCode, + LLVMContext &Context, + TargetMachine &TM, + MachineModuleInfo &MMI) { SMDiagnostic Diagnostic; std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); - MIR = createMIRParser(std::move(MBuffer), Context); - if (!MIR) - return nullptr; + std::unique_ptr MIR = + createMIRParser(std::move(MBuffer), Context); + assert(MIR); std::unique_ptr Mod = MIR->parseIRModule(); - if (!Mod) - return nullptr; + assert(Mod); + // Module identifier is used in tests below. 
+ Mod->setModuleIdentifier("module"); Mod->setDataLayout(TM.createDataLayout()); - if (MIR->parseMachineFunctions(*Mod, MMI)) { - M.reset(); - return nullptr; - } + bool Ret = MIR->parseMachineFunctions(*Mod, MMI); + assert(!Ret); + return Mod; } static PreservedAnalyses - getAnalysisResult(MachineFunction &U, - MachineFunctionAnalysisManager::Base &AM) { - auto &MFAM = static_cast(AM); + getAnalysisResult(MachineFunction &U, MachineFunctionAnalysisManager &MFAM) { MFAM.getResult(U); return PreservedAnalyses::all(); } @@ -356,25 +353,18 @@ class MachineFunctionCallbacksTest : public testing::Test { TripleName, "", "", TargetOptions(), std::nullopt))); if (!TM) GTEST_SKIP(); - MMI = std::make_unique(TM.get()); - M = parseMIR(*TM, MIRString, *MMI); - AM.registerPass([&] { return MachineModuleAnalysis(*MMI); }); - } - MachineFunctionCallbacksTest() - : CallbacksHandle(), PB(nullptr, PipelineTuningOptions(), std::nullopt, - &CallbacksHandle.Callbacks), - PM(), FAM(), AM(), MFAM(FAM, AM) { - - EXPECT_TRUE(&CallbacksHandle.Callbacks == - PB.getPassInstrumentationCallbacks()); + MMI = std::make_unique(TM.get()); + M = parseMIR(MIRString, Context, *TM, *MMI); + PB = std::make_unique(TM.get(), PipelineTuningOptions(), + std::nullopt, &PIC); /// Register a callback for analysis registration. /// /// The callback is a function taking a reference to an AnalyisManager /// object. When called, the callee gets to register its own analyses with /// this PassBuilder instance. - PB.registerAnalysisRegistrationCallback( + PB->registerAnalysisRegistrationCallback( [this](MachineFunctionAnalysisManager &AM) { // Register our mock analysis AM.registerPass([this] { return AnalysisHandle.getAnalysis(); }); @@ -386,24 +376,29 @@ class MachineFunctionCallbacksTest : public testing::Test { /// callbacks for each encountered pass name that it does not know. This /// includes both simple pass names as well as names of sub-pipelines. In /// the latter case, the InnerPipeline is not empty. 
- PB.registerPipelineParsingCallback( - [this](StringRef Name, MachineFunctionPassManager &PM) { + PB->registerPipelineParsingCallback( + [this](StringRef Name, MachineFunctionPassManager &PM, + ArrayRef InnerPipeline) { if (parseAnalysisUtilityPasses( "test-analysis", Name, PM)) return true; /// Parse the name of our pass mock handle if (Name == "test-transform") { - MFPM.addPass(PassHandle.getPass()); + PM.addPass(PassHandle.getPass()); return true; } return false; }); /// Register builtin analyses and cross-register the analysis proxies - PB.registerModuleAnalyses(AM); - PB.registerFunctionAnalyses(FAM); - PB.registerMachineFunctionAnalyses(MFAM); + PB->registerModuleAnalyses(MAM); + PB->registerCGSCCAnalyses(CGAM); + PB->registerFunctionAnalyses(FAM); + PB->registerLoopAnalyses(LAM); + PB->registerMachineFunctionAnalyses(MFAM); + PB->crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); + MAM.registerPass([&] { return MachineModuleAnalysis(*MMI); }); } }; @@ -412,53 +407,58 @@ TEST_F(MachineFunctionCallbacksTest, Passes) { EXPECT_CALL(PassHandle, run(HasName("test"), _)).WillOnce(&getAnalysisResult); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded()) + ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded()) << "Pipeline was: " << PipelineText; - ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded()); + MPM.run(*M, MAM); } TEST_F(MachineFunctionCallbacksTest, InstrumentedPasses) { - CallbacksHandle.registerPassInstrumentation(); + CallbacksHandle.registerPassInstrumentation(PIC); // Non-mock instrumentation not specifically mentioned below can be ignored. 
- CallbacksHandle.ignoreNonMockPassInstrumentation(""); CallbacksHandle.ignoreNonMockPassInstrumentation("test"); - CallbacksHandle.ignoreNonMockPassInstrumentation(""); + CallbacksHandle.ignoreNonMockPassInstrumentation("module"); // PassInstrumentation calls should happen in-sequence, in the same order // as passes/analyses are scheduled. ::testing::Sequence PISequence; EXPECT_CALL(CallbacksHandle, runBeforePass(HasNameRegex("MockPassHandle"), HasName("test"))) - .InSequence(PISequence); + .InSequence(PISequence) + .WillOnce(Return(true)); EXPECT_CALL( CallbacksHandle, runBeforeNonSkippedPass(HasNameRegex("MockPassHandle"), HasName("test"))) .InSequence(PISequence); - EXPECT_CALL(CallbacksHandle, - runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), _)) + EXPECT_CALL( + CallbacksHandle, + runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("test"))) .InSequence(PISequence); - EXPECT_CALL(CallbacksHandle, - runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), _)) + EXPECT_CALL( + CallbacksHandle, + runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("test"))) .InSequence(PISequence); EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("MockPassHandle"), HasName("test"), _)) .InSequence(PISequence); + EXPECT_CALL( + CallbacksHandle, + runBeforeSkippedPass(HasNameRegex("MockPassHandle"), HasName("test"))) + .Times(0); EXPECT_CALL(AnalysisHandle, run(HasName("test"), _)); EXPECT_CALL(PassHandle, run(HasName("test"), _)).WillOnce(&getAnalysisResult); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded()) + ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded()) << "Pipeline was: " << PipelineText; - ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded()); + MPM.run(*M, MAM); } TEST_F(MachineFunctionCallbacksTest, InstrumentedSkippedPasses) { - CallbacksHandle.registerPassInstrumentation(); + CallbacksHandle.registerPassInstrumentation(PIC); // Non-mock instrumentation 
run here can safely be ignored. - CallbacksHandle.ignoreNonMockPassInstrumentation(""); CallbacksHandle.ignoreNonMockPassInstrumentation("test"); - CallbacksHandle.ignoreNonMockPassInstrumentation(""); + CallbacksHandle.ignoreNonMockPassInstrumentation("module"); // Skip the pass by returning false. EXPECT_CALL(CallbacksHandle, @@ -495,9 +495,81 @@ TEST_F(MachineFunctionCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded()) + ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded()) << "Pipeline was: " << PipelineText; - ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded()); + MPM.run(*M, MAM); +} + +// Check that the Module -> MachineFunction adaptor properly calls +// runAfterPassInvalidated. +TEST_F(MachineFunctionCallbacksTest, InstrumentedFreeMFPass) { + CallbacksHandle.registerPassInstrumentation(PIC); + // Non-mock instrumentation run here can safely be ignored. + CallbacksHandle.ignoreNonMockPassInstrumentation("test"); + CallbacksHandle.ignoreNonMockPassInstrumentation("module"); + + ::testing::Sequence PISequence; + EXPECT_CALL( + CallbacksHandle, + runBeforePass(HasNameRegex("FreeMachineFunctionPass"), HasName("test"))) + .InSequence(PISequence) + .WillOnce(Return(true)); + EXPECT_CALL(CallbacksHandle, + runBeforeNonSkippedPass(HasNameRegex("FreeMachineFunctionPass"), + HasName("test"))) + .InSequence(PISequence); + EXPECT_CALL(CallbacksHandle, runAfterPassInvalidated( + HasNameRegex("FreeMachineFunctionPass"), _)) + .InSequence(PISequence); + + // runAfterPass should not be called since the MachineFunction is no longer + // valid after FreeMachineFunctionPass. 
+ EXPECT_CALL(CallbacksHandle, + runAfterPass(HasNameRegex("FreeMachineFunctionPass"), _, _)) + .Times(0); + + MPM.addPass( + createModuleToMachineFunctionPassAdaptor(FreeMachineFunctionPass())); + MPM.run(*M, MAM); +} + +// Check that the Module -> MachineFunction adaptor and MachineFunction pass +// manager properly call runAfterPassInvalidated. +TEST_F(MachineFunctionCallbacksTest, InstrumentedFreeMFPass2) { + CallbacksHandle.registerPassInstrumentation(PIC); + // Non-mock instrumentation run here can safely be ignored. + CallbacksHandle.ignoreNonMockPassInstrumentation("test"); + CallbacksHandle.ignoreNonMockPassInstrumentation("module"); + + ::testing::Sequence PISequence; + EXPECT_CALL( + CallbacksHandle, + runBeforePass(HasNameRegex("FreeMachineFunctionPass"), HasName("test"))) + .InSequence(PISequence) + .WillOnce(Return(true)); + EXPECT_CALL(CallbacksHandle, + runBeforeNonSkippedPass(HasNameRegex("FreeMachineFunctionPass"), + HasName("test"))) + .InSequence(PISequence); + EXPECT_CALL(CallbacksHandle, runAfterPassInvalidated( + HasNameRegex("FreeMachineFunctionPass"), _)) + .InSequence(PISequence); + EXPECT_CALL(CallbacksHandle, + runAfterPassInvalidated(HasNameRegex("PassManager"), _)) + .InSequence(PISequence); + + // runAfterPass should not be called since the MachineFunction is no longer + // valid after FreeMachineFunctionPass. 
+ EXPECT_CALL(CallbacksHandle, + runAfterPass(HasNameRegex("FreeMachineFunctionPass"), _, _)) + .Times(0); + EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("PassManager"), _, _)) + .Times(0); + + MachineFunctionPassManager MFPM; + MFPM.addPass(FreeMachineFunctionPass()); + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + MPM.run(*M, MAM); } } // end anonymous namespace From 7f71fa909a10be182b82b9dfaf0fade6eb84796c Mon Sep 17 00:00:00 2001 From: Thomas Preud'homme Date: Thu, 22 Feb 2024 21:01:05 +0000 Subject: [PATCH 271/351] Extend GCC workaround to GCC < 8.4 for llvm::iterator_range ctor (#82643) GCC SFINAE error with decltype was fixed in commit ac5e28911abdfb8d9bf6bea980223e199bbcf28d which made it into GCC 8.4. Therefore adjust GCC version test accordingly. --- llvm/include/llvm/ADT/iterator_range.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/iterator_range.h b/llvm/include/llvm/ADT/iterator_range.h index 2dc227935984b..7d288ea4506ba 100644 --- a/llvm/include/llvm/ADT/iterator_range.h +++ b/llvm/include/llvm/ADT/iterator_range.h @@ -43,8 +43,8 @@ class iterator_range { IteratorT begin_iterator, end_iterator; public: -#if __GNUC__ == 7 - // Be careful no to break gcc-7 on the mlir target. +#if __GNUC__ == 7 || (__GNUC__ == 8 && __GNUC_MINOR__ < 4) + // Be careful no to break gcc-7 and gcc-8 < 8.4 on the mlir target. // See https://github.com/llvm/llvm-project/issues/63843 template #else From df6f756a19277d936ec83f7cebc2501327ac3add Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Thu, 22 Feb 2024 16:11:40 -0500 Subject: [PATCH 272/351] Re-land [lldb-dap] Add support for data breakpoint. (#81909) This implements functionality to handle DataBreakpointInfo request and SetDataBreakpoints request. Previous commit https://github.com/llvm/llvm-project/commit/8c56e78ec531f0e2460213c20fff869b6b7add99 was reverted because setting 1 byte watchpoint failed in the new test on ARM64. 
So, I changed the test to setting 4 byte watchpoint instead, and hope this won't break it again. It also adds the fixes from https://github.com/llvm/llvm-project/pull/81680. --- .../test/tools/lldb-dap/dap_server.py | 47 +++ .../tools/lldb-dap/databreakpoint/Makefile | 3 + .../TestDAP_setDataBreakpoints.py | 131 +++++++ .../tools/lldb-dap/databreakpoint/main.cpp | 17 + lldb/tools/lldb-dap/CMakeLists.txt | 1 + lldb/tools/lldb-dap/DAPForward.h | 2 + lldb/tools/lldb-dap/Watchpoint.cpp | 48 +++ lldb/tools/lldb-dap/Watchpoint.h | 34 ++ lldb/tools/lldb-dap/lldb-dap.cpp | 341 ++++++++++++++++-- 9 files changed, 590 insertions(+), 34 deletions(-) create mode 100644 lldb/test/API/tools/lldb-dap/databreakpoint/Makefile create mode 100644 lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py create mode 100644 lldb/test/API/tools/lldb-dap/databreakpoint/main.cpp create mode 100644 lldb/tools/lldb-dap/Watchpoint.cpp create mode 100644 lldb/tools/lldb-dap/Watchpoint.h diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index bb863bb871917..27a76a652f406 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -501,6 +501,18 @@ def get_local_variable_value(self, name, frameIndex=0, threadId=None): return variable["value"] return None + def get_local_variable_child(self, name, child_name, frameIndex=0, threadId=None): + local = self.get_local_variable(name, frameIndex, threadId) + if local["variablesReference"] == 0: + return None + children = self.request_variables(local["variablesReference"])["body"][ + "variables" + ] + for child in children: + if child["name"] == child_name: + return child + return None + def replay_packets(self, replay_file_path): f = open(replay_file_path, "r") mode = "invalid" @@ -895,6 +907,41 @@ def request_setFunctionBreakpoints(self, names, 
condition=None, hitCondition=Non } return self.send_recv(command_dict) + def request_dataBreakpointInfo( + self, variablesReference, name, frameIndex=0, threadId=None + ): + stackFrame = self.get_stackFrame(frameIndex=frameIndex, threadId=threadId) + if stackFrame is None: + return [] + args_dict = { + "variablesReference": variablesReference, + "name": name, + "frameId": stackFrame["id"], + } + command_dict = { + "command": "dataBreakpointInfo", + "type": "request", + "arguments": args_dict, + } + return self.send_recv(command_dict) + + def request_setDataBreakpoint(self, dataBreakpoints): + """dataBreakpoints is a list of dictionary with following fields: + { + dataId: (address in hex)/(size in bytes) + accessType: read/write/readWrite + [condition]: string + [hitCondition]: string + } + """ + args_dict = {"breakpoints": dataBreakpoints} + command_dict = { + "command": "setDataBreakpoints", + "type": "request", + "arguments": args_dict, + } + return self.send_recv(command_dict) + def request_compileUnits(self, moduleId): args_dict = {"moduleId": moduleId} command_dict = { diff --git a/lldb/test/API/tools/lldb-dap/databreakpoint/Makefile b/lldb/test/API/tools/lldb-dap/databreakpoint/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/databreakpoint/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py b/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py new file mode 100644 index 0000000000000..17cdad89aa6d1 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py @@ -0,0 +1,131 @@ +""" +Test lldb-dap dataBreakpointInfo and setDataBreakpoints requests +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import lldbdap_testcase + + +class TestDAP_setDataBreakpoints(lldbdap_testcase.DAPTestCaseBase): + 
def setUp(self): + lldbdap_testcase.DAPTestCaseBase.setUp(self) + self.accessTypes = ["read", "write", "readWrite"] + + @skipIfWindows + @skipIfRemote + def test_expression(self): + """Tests setting data breakpoints on expression.""" + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.cpp" + first_loop_break_line = line_number(source, "// first loop breakpoint") + self.set_source_breakpoints(source, [first_loop_break_line]) + self.continue_to_next_stop() + self.dap_server.get_stackFrame() + # Test setting write watchpoint using expressions: &x, arr+2 + response_x = self.dap_server.request_dataBreakpointInfo(0, "&x") + response_arr_2 = self.dap_server.request_dataBreakpointInfo(0, "arr+2") + # Test response from dataBreakpointInfo request. + self.assertEquals(response_x["body"]["dataId"].split("/")[1], "4") + self.assertEquals(response_x["body"]["accessTypes"], self.accessTypes) + self.assertEquals(response_arr_2["body"]["dataId"].split("/")[1], "4") + self.assertEquals(response_arr_2["body"]["accessTypes"], self.accessTypes) + dataBreakpoints = [ + {"dataId": response_x["body"]["dataId"], "accessType": "write"}, + {"dataId": response_arr_2["body"]["dataId"], "accessType": "write"}, + ] + set_response = self.dap_server.request_setDataBreakpoint(dataBreakpoints) + self.assertEquals( + set_response["body"]["breakpoints"], + [{"verified": True}, {"verified": True}], + ) + + self.continue_to_next_stop() + x_val = self.dap_server.get_local_variable_value("x") + i_val = self.dap_server.get_local_variable_value("i") + self.assertEquals(x_val, "2") + self.assertEquals(i_val, "1") + + self.continue_to_next_stop() + arr_2 = self.dap_server.get_local_variable_child("arr", "[2]") + i_val = self.dap_server.get_local_variable_value("i") + self.assertEquals(arr_2["value"], "42") + self.assertEquals(i_val, "2") + + @skipIfWindows + @skipIfRemote + def test_functionality(self): + """Tests setting data breakpoints on variable.""" + program = 
self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.cpp" + first_loop_break_line = line_number(source, "// first loop breakpoint") + self.set_source_breakpoints(source, [first_loop_break_line]) + self.continue_to_next_stop() + self.dap_server.get_local_variables() + # Test write watchpoints on x, arr[2] + response_x = self.dap_server.request_dataBreakpointInfo(1, "x") + arr = self.dap_server.get_local_variable("arr") + response_arr_2 = self.dap_server.request_dataBreakpointInfo( + arr["variablesReference"], "[2]" + ) + + # Test response from dataBreakpointInfo request. + self.assertEquals(response_x["body"]["dataId"].split("/")[1], "4") + self.assertEquals(response_x["body"]["accessTypes"], self.accessTypes) + self.assertEquals(response_arr_2["body"]["dataId"].split("/")[1], "4") + self.assertEquals(response_arr_2["body"]["accessTypes"], self.accessTypes) + dataBreakpoints = [ + {"dataId": response_x["body"]["dataId"], "accessType": "write"}, + {"dataId": response_arr_2["body"]["dataId"], "accessType": "write"}, + ] + set_response = self.dap_server.request_setDataBreakpoint(dataBreakpoints) + self.assertEquals( + set_response["body"]["breakpoints"], + [{"verified": True}, {"verified": True}], + ) + + self.continue_to_next_stop() + x_val = self.dap_server.get_local_variable_value("x") + i_val = self.dap_server.get_local_variable_value("i") + self.assertEquals(x_val, "2") + self.assertEquals(i_val, "1") + + self.continue_to_next_stop() + arr_2 = self.dap_server.get_local_variable_child("arr", "[2]") + i_val = self.dap_server.get_local_variable_value("i") + self.assertEquals(arr_2["value"], "42") + self.assertEquals(i_val, "2") + self.dap_server.request_setDataBreakpoint([]) + + # Test hit condition + second_loop_break_line = line_number(source, "// second loop breakpoint") + breakpoint_ids = self.set_source_breakpoints(source, [second_loop_break_line]) + self.continue_to_breakpoints(breakpoint_ids) + dataBreakpoints = [ + { + "dataId": 
response_x["body"]["dataId"], + "accessType": "write", + "hitCondition": "2", + } + ] + set_response = self.dap_server.request_setDataBreakpoint(dataBreakpoints) + self.assertEquals(set_response["body"]["breakpoints"], [{"verified": True}]) + self.continue_to_next_stop() + x_val = self.dap_server.get_local_variable_value("x") + self.assertEquals(x_val, "3") + + # Test condition + dataBreakpoints = [ + { + "dataId": response_x["body"]["dataId"], + "accessType": "write", + "condition": "x==10", + } + ] + set_response = self.dap_server.request_setDataBreakpoint(dataBreakpoints) + self.assertEquals(set_response["body"]["breakpoints"], [{"verified": True}]) + self.continue_to_next_stop() + x_val = self.dap_server.get_local_variable_value("x") + self.assertEquals(x_val, "10") diff --git a/lldb/test/API/tools/lldb-dap/databreakpoint/main.cpp b/lldb/test/API/tools/lldb-dap/databreakpoint/main.cpp new file mode 100644 index 0000000000000..bef09c203845e --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/databreakpoint/main.cpp @@ -0,0 +1,17 @@ +int main(int argc, char const *argv[]) { + // Test for data breakpoint + int x = 0; + int arr[4] = {1, 2, 3, 4}; + for (int i = 0; i < 5; ++i) { // first loop breakpoint + if (i == 1) { + x = i + 1; + } else if (i == 2) { + arr[i] = 42; + } + } + + x = 1; + for (int i = 0; i < 10; ++i) { // second loop breakpoint + ++x; + } +} diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index f8c0e4ecf36c2..f8f0d86453f58 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -37,6 +37,7 @@ add_lldb_tool(lldb-dap RunInTerminal.cpp SourceBreakpoint.cpp DAP.cpp + Watchpoint.cpp LINK_LIBS liblldb diff --git a/lldb/tools/lldb-dap/DAPForward.h b/lldb/tools/lldb-dap/DAPForward.h index fffff1e3f7902..8c79488fae8db 100644 --- a/lldb/tools/lldb-dap/DAPForward.h +++ b/lldb/tools/lldb-dap/DAPForward.h @@ -14,6 +14,7 @@ struct BreakpointBase; struct ExceptionBreakpoint; struct 
FunctionBreakpoint; struct SourceBreakpoint; +struct Watchpoint; } // namespace lldb_dap namespace lldb { @@ -39,6 +40,7 @@ class SBStringList; class SBTarget; class SBThread; class SBValue; +class SBWatchpoint; } // namespace lldb #endif diff --git a/lldb/tools/lldb-dap/Watchpoint.cpp b/lldb/tools/lldb-dap/Watchpoint.cpp new file mode 100644 index 0000000000000..2f176e0da84f1 --- /dev/null +++ b/lldb/tools/lldb-dap/Watchpoint.cpp @@ -0,0 +1,48 @@ +//===-- Watchpoint.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Watchpoint.h" +#include "DAP.h" +#include "JSONUtils.h" +#include "llvm/ADT/StringExtras.h" + +namespace lldb_dap { +Watchpoint::Watchpoint(const llvm::json::Object &obj) : BreakpointBase(obj) { + llvm::StringRef dataId = GetString(obj, "dataId"); + std::string accessType = GetString(obj, "accessType").str(); + auto [addr_str, size_str] = dataId.split('/'); + lldb::addr_t addr; + size_t size; + llvm::to_integer(addr_str, addr, 16); + llvm::to_integer(size_str, size); + lldb::SBWatchpointOptions options; + options.SetWatchpointTypeRead(accessType != "write"); + if (accessType != "read") + options.SetWatchpointTypeWrite(lldb::eWatchpointWriteTypeOnModify); + wp = g_dap.target.WatchpointCreateByAddress(addr, size, options, error); + SetCondition(); + SetHitCondition(); +} + +void Watchpoint::SetCondition() { wp.SetCondition(condition.c_str()); } + +void Watchpoint::SetHitCondition() { + uint64_t hitCount = 0; + if (llvm::to_integer(hitCondition, hitCount)) + wp.SetIgnoreCount(hitCount - 1); +} + +void Watchpoint::CreateJsonObject(llvm::json::Object &object) { + if (error.Success()) { + object.try_emplace("verified", true); + } else { + 
object.try_emplace("verified", false); + EmplaceSafeString(object, "message", error.GetCString()); + } +} +} // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Watchpoint.h b/lldb/tools/lldb-dap/Watchpoint.h new file mode 100644 index 0000000000000..026b07d67241c --- /dev/null +++ b/lldb/tools/lldb-dap/Watchpoint.h @@ -0,0 +1,34 @@ +//===-- Watchpoint.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TOOLS_LLDB_DAP_WATCHPOINT_H +#define LLDB_TOOLS_LLDB_DAP_WATCHPOINT_H + +#include "BreakpointBase.h" +#include "lldb/API/SBError.h" +#include "lldb/API/SBWatchpoint.h" +#include "lldb/API/SBWatchpointOptions.h" + +namespace lldb_dap { + +struct Watchpoint : public BreakpointBase { + // The LLDB breakpoint associated wit this watchpoint. 
+  lldb::SBWatchpoint wp;
+  lldb::SBError error;
+
+  Watchpoint() = default;
+  Watchpoint(const llvm::json::Object &obj);
+  Watchpoint(lldb::SBWatchpoint wp) : wp(wp) {}
+
+  void SetCondition() override;
+  void SetHitCondition() override;
+  void CreateJsonObject(llvm::json::Object &object) override;
+};
+} // namespace lldb_dap
+
+#endif
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 78b0b4078706a..c6a275bcf8140 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "DAP.h"
+#include "Watchpoint.h"
+#include "lldb/API/SBMemoryRegionInfo.h"
 
 #include
 #include
@@ -560,6 +562,46 @@ void EventThreadFunction() {
   }
 }
 
+lldb::SBValue FindVariable(uint64_t variablesReference, llvm::StringRef name) {
+  lldb::SBValue variable;
+  if (lldb::SBValueList *top_scope = GetTopLevelScope(variablesReference)) {
+    bool is_duplicated_variable_name = name.contains(" @");
+    // variablesReference is one of our scopes, not an actual variable it is
+    // asking for a variable in locals or globals or registers
+    int64_t end_idx = top_scope->GetSize();
+    // Searching backward so that we choose the variable in closest scope
+    // among variables of the same name.
+    for (int64_t i = end_idx - 1; i >= 0; --i) {
+      lldb::SBValue curr_variable = top_scope->GetValueAtIndex(i);
+      std::string variable_name = CreateUniqueVariableNameForDisplay(
+          curr_variable, is_duplicated_variable_name);
+      if (variable_name == name) {
+        variable = curr_variable;
+        break;
+      }
+    }
+  } else {
+    // This is not under the globals or locals scope, so there are no duplicated
+    // names.
+
+    // We have a named item within an actual variable so we need to find it
+    // within the container variable by name.
+ lldb::SBValue container = g_dap.variables.GetVariable(variablesReference); + variable = container.GetChildMemberWithName(name.data()); + if (!variable.IsValid()) { + if (name.starts_with("[")) { + llvm::StringRef index_str(name.drop_front(1)); + uint64_t index = 0; + if (!index_str.consumeInteger(0, index)) { + if (index_str == "]") + variable = container.GetChildAtIndex(index); + } + } + } + } + return variable; +} + // Both attach and launch take a either a sourcePath or sourceMap // argument (or neither), from which we need to set the target.source-map. void SetSourceMapFromArguments(const llvm::json::Object &arguments) { @@ -1647,6 +1689,8 @@ void request_initialize(const llvm::json::Object &request) { body.try_emplace("supportsProgressReporting", true); // The debug adapter supports 'logMessage' in breakpoint. body.try_emplace("supportsLogPoints", true); + // The debug adapter supports data watchpoints. + body.try_emplace("supportsDataBreakpoints", true); response.try_emplace("body", std::move(body)); g_dap.SendJSON(llvm::json::Value(std::move(response))); @@ -2593,6 +2637,264 @@ void request_setFunctionBreakpoints(const llvm::json::Object &request) { g_dap.SendJSON(llvm::json::Value(std::move(response))); } +// "DataBreakpointInfoRequest": { +// "allOf": [ { "$ref": "#/definitions/Request" }, { +// "type": "object", +// "description": "Obtains information on a possible data breakpoint that +// could be set on an expression or variable.\nClients should only call this +// request if the corresponding capability `supportsDataBreakpoints` is +// true.", "properties": { +// "command": { +// "type": "string", +// "enum": [ "dataBreakpointInfo" ] +// }, +// "arguments": { +// "$ref": "#/definitions/DataBreakpointInfoArguments" +// } +// }, +// "required": [ "command", "arguments" ] +// }] +// }, +// "DataBreakpointInfoArguments": { +// "type": "object", +// "description": "Arguments for `dataBreakpointInfo` request.", +// "properties": { +// "variablesReference": 
{ +// "type": "integer", +// "description": "Reference to the variable container if the data +// breakpoint is requested for a child of the container. The +// `variablesReference` must have been obtained in the current suspended +// state. See 'Lifetime of Object References' in the Overview section for +// details." +// }, +// "name": { +// "type": "string", +// "description": "The name of the variable's child to obtain data +// breakpoint information for.\nIf `variablesReference` isn't specified, +// this can be an expression." +// }, +// "frameId": { +// "type": "integer", +// "description": "When `name` is an expression, evaluate it in the scope +// of this stack frame. If not specified, the expression is evaluated in +// the global scope. When `variablesReference` is specified, this property +// has no effect." +// } +// }, +// "required": [ "name" ] +// }, +// "DataBreakpointInfoResponse": { +// "allOf": [ { "$ref": "#/definitions/Response" }, { +// "type": "object", +// "description": "Response to `dataBreakpointInfo` request.", +// "properties": { +// "body": { +// "type": "object", +// "properties": { +// "dataId": { +// "type": [ "string", "null" ], +// "description": "An identifier for the data on which a data +// breakpoint can be registered with the `setDataBreakpoints` +// request or null if no data breakpoint is available. If a +// `variablesReference` or `frameId` is passed, the `dataId` is +// valid in the current suspended state, otherwise it's valid +// indefinitely. See 'Lifetime of Object References' in the Overview +// section for details. Breakpoints set using the `dataId` in the +// `setDataBreakpoints` request may outlive the lifetime of the +// associated `dataId`." +// }, +// "description": { +// "type": "string", +// "description": "UI string that describes on what data the +// breakpoint is set on or why a data breakpoint is not available." 
+// }, +// "accessTypes": { +// "type": "array", +// "items": { +// "$ref": "#/definitions/DataBreakpointAccessType" +// }, +// "description": "Attribute lists the available access types for a +// potential data breakpoint. A UI client could surface this +// information." +// }, +// "canPersist": { +// "type": "boolean", +// "description": "Attribute indicates that a potential data +// breakpoint could be persisted across sessions." +// } +// }, +// "required": [ "dataId", "description" ] +// } +// }, +// "required": [ "body" ] +// }] +// } +void request_dataBreakpointInfo(const llvm::json::Object &request) { + llvm::json::Object response; + FillResponse(request, response); + llvm::json::Object body; + lldb::SBError error; + llvm::json::Array accessTypes{"read", "write", "readWrite"}; + const auto *arguments = request.getObject("arguments"); + const auto variablesReference = + GetUnsigned(arguments, "variablesReference", 0); + llvm::StringRef name = GetString(arguments, "name"); + lldb::SBFrame frame = g_dap.GetLLDBFrame(*arguments); + lldb::SBValue variable = FindVariable(variablesReference, name); + std::string addr, size; + + if (variable.IsValid()) { + lldb::addr_t load_addr = variable.GetLoadAddress(); + size_t byte_size = variable.GetByteSize(); + if (load_addr == LLDB_INVALID_ADDRESS) { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", + "does not exist in memory, its location is " + + std::string(variable.GetLocation())); + } else if (byte_size == 0) { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", "variable size is 0"); + } else { + addr = llvm::utohexstr(load_addr); + size = llvm::utostr(byte_size); + } + } else if (variablesReference == 0 && frame.IsValid()) { + lldb::SBValue value = frame.EvaluateExpression(name.data()); + if (value.GetError().Fail()) { + lldb::SBError error = value.GetError(); + const char *error_cstr = error.GetCString(); + body.try_emplace("dataId", nullptr); + 
body.try_emplace("description", error_cstr && error_cstr[0] + ? std::string(error_cstr) + : "evaluation failed"); + } else { + uint64_t load_addr = value.GetValueAsUnsigned(); + addr = llvm::utohexstr(load_addr); + lldb::SBMemoryRegionInfo region; + lldb::SBError err = + g_dap.target.GetProcess().GetMemoryRegionInfo(load_addr, region); + if (err.Success()) { + if (!(region.IsReadable() || region.IsWritable())) { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", + "memory region for address " + addr + + " has no read or write permissions"); + } else { + lldb::SBData data = value.GetPointeeData(); + if (data.IsValid()) + size = llvm::utostr(data.GetByteSize()); + else { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", + "unable to get byte size for expression: " + + name.str()); + } + } + } else { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", + "unable to get memory region info for address " + + addr); + } + } + } else { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", "variable not found: " + name.str()); + } + + if (!body.getObject("dataId")) { + body.try_emplace("dataId", addr + "/" + size); + body.try_emplace("accessTypes", std::move(accessTypes)); + body.try_emplace("description", + size + " bytes at " + addr + " " + name.str()); + } + response.try_emplace("body", std::move(body)); + g_dap.SendJSON(llvm::json::Value(std::move(response))); +} + +// "SetDataBreakpointsRequest": { +// "allOf": [ { "$ref": "#/definitions/Request" }, { +// "type": "object", +// "description": "Replaces all existing data breakpoints with new data +// breakpoints.\nTo clear all data breakpoints, specify an empty +// array.\nWhen a data breakpoint is hit, a `stopped` event (with reason +// `data breakpoint`) is generated.\nClients should only call this request +// if the corresponding capability `supportsDataBreakpoints` is true.", +// "properties": { +// "command": { +// "type": "string", 
+// "enum": [ "setDataBreakpoints" ] +// }, +// "arguments": { +// "$ref": "#/definitions/SetDataBreakpointsArguments" +// } +// }, +// "required": [ "command", "arguments" ] +// }] +// }, +// "SetDataBreakpointsArguments": { +// "type": "object", +// "description": "Arguments for `setDataBreakpoints` request.", +// "properties": { +// "breakpoints": { +// "type": "array", +// "items": { +// "$ref": "#/definitions/DataBreakpoint" +// }, +// "description": "The contents of this array replaces all existing data +// breakpoints. An empty array clears all data breakpoints." +// } +// }, +// "required": [ "breakpoints" ] +// }, +// "SetDataBreakpointsResponse": { +// "allOf": [ { "$ref": "#/definitions/Response" }, { +// "type": "object", +// "description": "Response to `setDataBreakpoints` request.\nReturned is +// information about each breakpoint created by this request.", +// "properties": { +// "body": { +// "type": "object", +// "properties": { +// "breakpoints": { +// "type": "array", +// "items": { +// "$ref": "#/definitions/Breakpoint" +// }, +// "description": "Information about the data breakpoints. The array +// elements correspond to the elements of the input argument +// `breakpoints` array." 
+// } +// }, +// "required": [ "breakpoints" ] +// } +// }, +// "required": [ "body" ] +// }] +// } +void request_setDataBreakpoints(const llvm::json::Object &request) { + llvm::json::Object response; + lldb::SBError error; + FillResponse(request, response); + const auto *arguments = request.getObject("arguments"); + const auto *breakpoints = arguments->getArray("breakpoints"); + llvm::json::Array response_breakpoints; + g_dap.target.DeleteAllWatchpoints(); + if (breakpoints) { + for (const auto &bp : *breakpoints) { + const auto *bp_obj = bp.getAsObject(); + if (bp_obj) { + Watchpoint wp(*bp_obj); + AppendBreakpoint(&wp, response_breakpoints); + } + } + } + llvm::json::Object body; + body.try_emplace("breakpoints", std::move(response_breakpoints)); + response.try_emplace("body", std::move(body)); + g_dap.SendJSON(llvm::json::Value(std::move(response))); +} + // "SourceRequest": { // "allOf": [ { "$ref": "#/definitions/Request" }, { // "type": "object", @@ -3076,7 +3378,6 @@ void request_setVariable(const llvm::json::Object &request) { const auto variablesReference = GetUnsigned(arguments, "variablesReference", 0); llvm::StringRef name = GetString(arguments, "name"); - bool is_duplicated_variable_name = name.contains(" @"); const auto value = GetString(arguments, "value"); // Set success to false just in case we don't find the variable by name @@ -3097,40 +3398,8 @@ void request_setVariable(const llvm::json::Object &request) { const auto id_value = GetUnsigned(arguments, "id", UINT64_MAX); if (id_value != UINT64_MAX) { variable = g_dap.variables.GetVariable(id_value); - } else if (lldb::SBValueList *top_scope = - GetTopLevelScope(variablesReference)) { - // variablesReference is one of our scopes, not an actual variable it is - // asking for a variable in locals or globals or registers - int64_t end_idx = top_scope->GetSize(); - // Searching backward so that we choose the variable in closest scope - // among variables of the same name. 
- for (int64_t i = end_idx - 1; i >= 0; --i) { - lldb::SBValue curr_variable = top_scope->GetValueAtIndex(i); - std::string variable_name = CreateUniqueVariableNameForDisplay( - curr_variable, is_duplicated_variable_name); - if (variable_name == name) { - variable = curr_variable; - break; - } - } } else { - // This is not under the globals or locals scope, so there are no duplicated - // names. - - // We have a named item within an actual variable so we need to find it - // withing the container variable by name. - lldb::SBValue container = g_dap.variables.GetVariable(variablesReference); - variable = container.GetChildMemberWithName(name.data()); - if (!variable.IsValid()) { - if (name.starts_with("[")) { - llvm::StringRef index_str(name.drop_front(1)); - uint64_t index = 0; - if (!index_str.consumeInteger(0, index)) { - if (index_str == "]") - variable = container.GetChildAtIndex(index); - } - } - } + variable = FindVariable(variablesReference, name); } if (variable.IsValid()) { @@ -3613,6 +3882,10 @@ void RegisterRequestCallbacks() { request_setExceptionBreakpoints); g_dap.RegisterRequestCallback("setFunctionBreakpoints", request_setFunctionBreakpoints); + g_dap.RegisterRequestCallback("dataBreakpointInfo", + request_dataBreakpointInfo); + g_dap.RegisterRequestCallback("setDataBreakpoints", + request_setDataBreakpoints); g_dap.RegisterRequestCallback("setVariable", request_setVariable); g_dap.RegisterRequestCallback("source", request_source); g_dap.RegisterRequestCallback("stackTrace", request_stackTrace); From 0eb64eebdecb3c138b4adfea1cbcdd03aa7d455c Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 22 Feb 2024 21:12:51 +0000 Subject: [PATCH 273/351] [gn build] Port df6f756a1927 --- llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn index 98c2068f6da29..8cb60fd81840f 100644 --- 
a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn @@ -51,6 +51,7 @@ executable("lldb-dap") { "ProgressEvent.cpp", "RunInTerminal.cpp", "SourceBreakpoint.cpp", + "Watchpoint.cpp", "lldb-dap.cpp", ] } From 45fe67dd61a6ac7df84d3a586e41c36a4767757f Mon Sep 17 00:00:00 2001 From: Daniel Martinez Date: Thu, 22 Feb 2024 21:14:27 +0000 Subject: [PATCH 274/351] Fix build on musl by including stdint.h (#81434) openmp fails to build on musl since it lacks the defines for int32_t Co-authored-by: Daniel Martinez --- openmp/libomptarget/include/Shared/SourceInfo.h | 1 + 1 file changed, 1 insertion(+) diff --git a/openmp/libomptarget/include/Shared/SourceInfo.h b/openmp/libomptarget/include/Shared/SourceInfo.h index 7ce5fd43efc07..711f06a04d017 100644 --- a/openmp/libomptarget/include/Shared/SourceInfo.h +++ b/openmp/libomptarget/include/Shared/SourceInfo.h @@ -13,6 +13,7 @@ #ifndef OMPTARGET_SHARED_SOURCE_INFO_H #define OMPTARGET_SHARED_SOURCE_INFO_H +#include #include #ifdef _WIN32 From 47b7c91abe7af3133a591aa2e73fffa30826f986 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 15:29:29 -0600 Subject: [PATCH 275/351] [libc] Rework the GPU build to be a regular target (#81921) Summary: This is a massive patch because it reworks the entire build and everything that depends on it. This is not split up because various bots would fail otherwise. I will attempt to describe the necessary changes here. This patch completely reworks how the GPU build is built and targeted. Previously, we used a standard runtimes build and handled both NVPTX and AMDGPU in a single build via multi-targeting. This added a lot of divergence in the build system and prevented us from doing various things like building for the CPU / GPU at the same time, or exporting the startup libraries or running tests without a full rebuild. The new appraoch is to handle the GPU builds as strict cross-compiling runtimes. 
The first step required https://github.com/llvm/llvm-project/pull/81557 to allow the `LIBC` target to build for the GPU without touching the other targets. This means that the GPU uses all the same handling as the other builds in `libc`. The new expected way to build the GPU libc is with `LLVM_LIBC_RUNTIME_TARGETS=amdgcn-amd-amdhsa;nvptx64-nvidia-cuda`. The second step was reworking how we generated the embedded GPU library by moving it into the library install step. Where we previously had one `libcgpu.a` we now have `libcgpu-amdgpu.a` and `libcgpu-nvptx.a`. This patch includes the necessary clang / OpenMP changes to make that not break the bots when this lands. We unfortunately still require that the NVPTX target has an `internal` target for tests. This is because the NVPTX target needs to do LTO for the provided version (The offloading toolchain can handle it) but cannot use it for the native toolchain which is used for making tests. This approach is vastly superior in every way, allowing us to treat the GPU as a standard cross-compiling target. We can now install the GPU utilities to do things like use the offload tests and other fun things. Some certain utilities need to be built with `--target=${LLVM_HOST_TRIPLE}` as well. I think this is a fine workaround as we will always assume that the GPU `libc` is a cross-build with a functioning host. 
Depends on https://github.com/llvm/llvm-project/pull/81557 --- clang/lib/Driver/ToolChains/CommonArgs.cpp | 37 +- clang/test/Driver/openmp-offload-gpu.c | 20 +- libc/CMakeLists.txt | 20 +- .../cmake/modules/LLVMLibCArchitectures.cmake | 28 +- libc/cmake/modules/LLVMLibCCheckMPFR.cmake | 2 +- .../modules/LLVMLibCCompileOptionRules.cmake | 76 +--- libc/cmake/modules/LLVMLibCHeaderRules.cmake | 2 +- libc/cmake/modules/LLVMLibCLibraryRules.cmake | 141 +++++-- libc/cmake/modules/LLVMLibCObjectRules.cmake | 348 ++++-------------- libc/cmake/modules/LLVMLibCTestRules.cmake | 47 ++- .../modules/prepare_libc_gpu_build.cmake | 108 ++---- libc/docs/gpu/using.rst | 33 +- libc/include/CMakeLists.txt | 6 +- libc/lib/CMakeLists.txt | 35 +- libc/src/__support/File/CMakeLists.txt | 2 +- libc/src/__support/GPU/CMakeLists.txt | 2 +- libc/src/__support/OSUtil/CMakeLists.txt | 2 +- libc/src/__support/RPC/CMakeLists.txt | 2 +- libc/src/math/CMakeLists.txt | 16 +- libc/src/math/gpu/vendor/CMakeLists.txt | 1 - libc/src/stdio/CMakeLists.txt | 2 +- libc/src/stdlib/CMakeLists.txt | 4 +- libc/src/string/CMakeLists.txt | 12 +- libc/startup/gpu/CMakeLists.txt | 35 +- libc/startup/gpu/amdgpu/CMakeLists.txt | 13 - libc/startup/gpu/nvptx/CMakeLists.txt | 9 - libc/test/CMakeLists.txt | 6 +- libc/test/IntegrationTest/CMakeLists.txt | 16 - libc/test/UnitTest/CMakeLists.txt | 2 +- libc/test/src/__support/CMakeLists.txt | 49 +-- libc/test/src/__support/CPP/CMakeLists.txt | 2 +- libc/test/src/__support/File/CMakeLists.txt | 2 +- libc/test/src/errno/CMakeLists.txt | 2 +- libc/test/src/math/CMakeLists.txt | 20 +- libc/test/src/math/smoke/CMakeLists.txt | 8 +- libc/test/src/stdio/CMakeLists.txt | 2 +- libc/test/src/stdlib/CMakeLists.txt | 6 +- libc/test/utils/UnitTest/CMakeLists.txt | 2 +- libc/utils/CMakeLists.txt | 2 +- libc/utils/MPFRWrapper/CMakeLists.txt | 2 +- libc/utils/gpu/CMakeLists.txt | 4 +- libc/utils/gpu/loader/CMakeLists.txt | 48 ++- libc/utils/gpu/loader/amdgpu/CMakeLists.txt | 6 +- 
libc/utils/gpu/loader/nvptx/CMakeLists.txt | 10 +- libc/utils/gpu/server/CMakeLists.txt | 9 + llvm/CMakeLists.txt | 4 +- llvm/cmake/modules/HandleLLVMOptions.cmake | 7 + llvm/runtimes/CMakeLists.txt | 11 +- openmp/libomptarget/CMakeLists.txt | 9 +- .../plugins-nextgen/common/CMakeLists.txt | 6 +- .../plugins-nextgen/common/src/RPC.cpp | 3 +- openmp/libomptarget/test/lit.cfg | 8 +- 52 files changed, 585 insertions(+), 664 deletions(-) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index e5196bd8b5ae9..347b250260c4c 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1087,10 +1087,41 @@ static void addOpenMPDeviceLibC(const ToolChain &TC, const ArgList &Args, "llvm-libc-decls"); bool HasLibC = llvm::sys::fs::exists(LibCDecls) && llvm::sys::fs::is_directory(LibCDecls); - if (Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, HasLibC)) { - CmdArgs.push_back("-lcgpu"); - CmdArgs.push_back("-lmgpu"); + if (!Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, HasLibC)) + return; + + // We don't have access to the offloading toolchains here, so determine from + // the arguments if we have any active NVPTX or AMDGPU toolchains. 
+ llvm::DenseSet Libraries; + if (const Arg *Targets = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) { + if (llvm::any_of(Targets->getValues(), + [](auto S) { return llvm::Triple(S).isAMDGPU(); })) { + Libraries.insert("-lcgpu-amdgpu"); + Libraries.insert("-lmgpu-amdgpu"); + } + if (llvm::any_of(Targets->getValues(), + [](auto S) { return llvm::Triple(S).isNVPTX(); })) { + Libraries.insert("-lcgpu-nvptx"); + Libraries.insert("-lmgpu-nvptx"); + } } + + for (StringRef Arch : Args.getAllArgValues(options::OPT_offload_arch_EQ)) { + if (llvm::any_of(llvm::split(Arch, ","), [](StringRef Str) { + return IsAMDGpuArch(StringToCudaArch(Str)); + })) { + Libraries.insert("-lcgpu-amdgpu"); + Libraries.insert("-lmgpu-amdgpu"); + } + if (llvm::any_of(llvm::split(Arch, ","), [](StringRef Str) { + return IsNVIDIAGpuArch(StringToCudaArch(Str)); + })) { + Libraries.insert("-lcgpu-nvptx"); + Libraries.insert("-lmgpu-nvptx"); + } + } + + llvm::append_range(CmdArgs, Libraries); } void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC, diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c index bccc5fd9483ac..5da74a35d87ad 100644 --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -393,14 +393,28 @@ // // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \ // RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \ +// RUN: --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgpu-gfx803.bc \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ -// RUN: --offload-arch=sm_52 -gpulibc -nogpuinc %s 2>&1 \ +// RUN: --rocm-path=%S/Inputs/rocm \ +// RUN: --offload-arch=sm_52,gfx803 -gpulibc -nogpuinc %s 2>&1 \ // RUN: | FileCheck --check-prefix=LIBC-GPU %s -// LIBC-GPU: "-lcgpu"{{.*}}"-lmgpu" +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \ +// RUN: 
--libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \ +// RUN: --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgpu-gfx803.bc \ +// RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ +// RUN: --rocm-path=%S/Inputs/rocm \ +// RUN: -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 \ +// RUN: -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -gpulibc -nogpuinc %s 2>&1 \ +// RUN: | FileCheck --check-prefix=LIBC-GPU %s +// LIBC-GPU-DAG: "-lcgpu-amdgpu" +// LIBC-GPU-DAG: "-lmgpu-amdgpu" +// LIBC-GPU-DAG: "-lcgpu-nvptx" +// LIBC-GPU-DAG: "-lmgpu-nvptx" // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \ // RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ // RUN: --offload-arch=sm_52 -nogpulibc -nogpuinc %s 2>&1 \ // RUN: | FileCheck --check-prefix=NO-LIBC-GPU %s -// NO-LIBC-GPU-NOT: "-lcgpu"{{.*}}"-lmgpu" +// NO-LIBC-GPU-NOT: -lmgpu{{.*}}-lcgpu diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 9f9839423499e..6a57fcec26e47 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -43,7 +43,7 @@ set(LIBC_NAMESPACE "__llvm_libc_${LLVM_VERSION_MAJOR}_${LLVM_VERSION_MINOR}_${LL CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'." ) -if(LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) +if(LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD) if(NOT LIBC_HDRGEN_EXE) # We need to set up hdrgen first since other targets depend on it. add_subdirectory(utils/LibcTableGenUtil) @@ -77,7 +77,7 @@ if(LIBC_HDRGEN_ONLY OR NEED_LIBC_HDRGEN) # to build libc-hdrgen and return. # Always make the RPC server availible to other projects for GPU mode. 
- if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) + if(LLVM_LIBC_GPU_BUILD) add_subdirectory(utils/gpu/server) endif() return() @@ -118,7 +118,7 @@ if(COMMAND_RETURN_CODE EQUAL 0) message(STATUS "Set COMPILER_RESOURCE_DIR to " "${COMPILER_RESOURCE_DIR} using --print-resource-dir") else() - if (LIBC_TARGET_ARCHITECTURE_IS_GPU) + if (LIBC_TARGET_OS_IS_GPU) message(FATAL_ERROR "COMPILER_RESOURCE_DIR must be set for GPU builds") else() set(COMPILER_RESOURCE_DIR OFF) @@ -216,11 +216,7 @@ foreach(config_path IN LISTS LIBC_CONFIG_JSON_FILE_LIST) load_libc_config(${config_path}/config.json ${cmd_line_conf}) endforeach() -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - set(LIBC_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) - set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm) - set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) -elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG) +if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG) set(LIBC_INCLUDE_DIR ${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE}) set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) @@ -235,7 +231,11 @@ else() set(LIBC_INCLUDE_DIR ${CMAKE_BINARY_DIR}/include) set(LIBC_LIBRARY_DIR ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}) endif() - set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}) + if(LIBC_TARGET_OS_IS_GPU) + set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) + else() + set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}) + endif() endif() if(LIBC_TARGET_TRIPLE) @@ -247,7 +247,7 @@ else() set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}) endif() -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) include(prepare_libc_gpu_build) set(LIBC_ENABLE_UNITTESTS OFF) endif() diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake 
b/libc/cmake/modules/LLVMLibCArchitectures.cmake index 623ed774be727..0dbc59ad643ac 100644 --- a/libc/cmake/modules/LLVMLibCArchitectures.cmake +++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake @@ -6,18 +6,6 @@ # platform. # ------------------------------------------------------------------------------ -if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) - # We set the generic target and OS to "gpu" here. More specific defintions - # for the exact target GPU are set up in prepare_libc_gpu_build.cmake. - set(LIBC_TARGET_OS "gpu") - set(LIBC_TARGET_ARCHITECTURE_IS_GPU TRUE) - set(LIBC_TARGET_ARCHITECTURE "gpu") - if(LIBC_TARGET_TRIPLE) - message(WARNING "LIBC_TARGET_TRIPLE is ignored as LIBC_GPU_BUILD is on. ") - endif() - return() -endif() - if(MSVC) # If the compiler is visual c++ or equivalent, we will assume a host build. set(LIBC_TARGET_OS ${CMAKE_HOST_SYSTEM_NAME}) @@ -59,6 +47,10 @@ function(get_arch_and_system_from_triple triple arch_var sys_var) set(target_arch "riscv32") elseif(target_arch MATCHES "^riscv64") set(target_arch "riscv64") + elseif(target_arch MATCHES "^amdgcn") + set(target_arch "amdgpu") + elseif(target_arch MATCHES "^nvptx64") + set(target_arch "nvptx") else() return() endif() @@ -75,6 +67,12 @@ function(get_arch_and_system_from_triple triple arch_var sys_var) set(target_sys "darwin") endif() + # Setting OS name for GPU architectures. 
+ list(GET triple_comps -1 gpu_target_sys) + if(gpu_target_sys MATCHES "^amdhsa" OR gpu_target_sys MATCHES "^cuda") + set(target_sys "gpu") + endif() + set(${sys_var} ${target_sys} PARENT_SCOPE) endfunction(get_arch_and_system_from_triple) @@ -156,6 +154,10 @@ elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv64") elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv32") set(LIBC_TARGET_ARCHITECTURE_IS_RISCV32 TRUE) set(LIBC_TARGET_ARCHITECTURE "riscv") +elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "amdgpu") + set(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU TRUE) +elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "nvptx") + set(LIBC_TARGET_ARCHITECTURE_IS_NVPTX TRUE) else() message(FATAL_ERROR "Unsupported libc target architecture ${LIBC_TARGET_ARCHITECTURE}") @@ -178,6 +180,8 @@ elseif(LIBC_TARGET_OS STREQUAL "darwin") set(LIBC_TARGET_OS_IS_DARWIN TRUE) elseif(LIBC_TARGET_OS STREQUAL "windows") set(LIBC_TARGET_OS_IS_WINDOWS TRUE) +elseif(LIBC_TARGET_OS STREQUAL "gpu") + set(LIBC_TARGET_OS_IS_GPU TRUE) else() message(FATAL_ERROR "Unsupported libc target operating system ${LIBC_TARGET_OS}") diff --git a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake index 9e361f5fd8112..bbaeb9f0dc053 100644 --- a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake +++ b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake @@ -2,7 +2,7 @@ set(LLVM_LIBC_MPFR_INSTALL_PATH "" CACHE PATH "Path to where MPFR is installed ( if(LLVM_LIBC_MPFR_INSTALL_PATH) set(LIBC_TESTS_CAN_USE_MPFR TRUE) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) set(LIBC_TESTS_CAN_USE_MPFR FALSE) else() try_compile( diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 140e4d51a9c2e..33ba5da4f8d57 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -82,10 +82,22 @@ function(_get_common_compile_options output_var flags) list(APPEND compile_options "/EHs-c-") 
list(APPEND compile_options "/GR-") endif() - if (LIBC_TARGET_ARCHITECTURE_IS_GPU) + if (LIBC_TARGET_OS_IS_GPU) list(APPEND compile_options "-nogpulib") list(APPEND compile_options "-fvisibility=hidden") list(APPEND compile_options "-fconvergent-functions") + list(APPEND compile_options "-flto") + + if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + list(APPEND compile_options "-Wno-unknown-cuda-version") + list(APPEND compile_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false") + list(APPEND compile_options "--cuda-feature=+ptx63") + if(LIBC_CUDA_ROOT) + list(APPEND compile_options "--cuda-path=${LIBC_CUDA_ROOT}") + endif() + elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none") + endif() # Manually disable all standard include paths and include the resource # directory to prevent system headers from being included. @@ -138,73 +150,21 @@ function(_get_common_test_compile_options output_var flags) set(${output_var} ${compile_options} PARENT_SCOPE) endfunction() -# Obtains NVPTX specific arguments for compilation. -# The PTX feature is primarily based on the CUDA toolchain version. We want to -# be able to target NVPTX without an existing CUDA installation, so we need to -# set this manually. This simply sets the PTX feature to the minimum required -# for the features we wish to use on that target. The minimum PTX features used -# here roughly corresponds to the CUDA 9.0 release. -# Adjust as needed for desired PTX features. 
-function(get_nvptx_compile_options output_var gpu_arch) - set(nvptx_options "") - list(APPEND nvptx_options "-march=${gpu_arch}") - list(APPEND nvptx_options "-Wno-unknown-cuda-version") - list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false") - if(${gpu_arch} STREQUAL "sm_35") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_37") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_50") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_52") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_53") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_60") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_61") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_62") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_70") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_72") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_75") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_80") - list(APPEND nvptx_options "--cuda-feature=+ptx72") - elseif(${gpu_arch} STREQUAL "sm_86") - list(APPEND nvptx_options "--cuda-feature=+ptx72") - elseif(${gpu_arch} STREQUAL "sm_89") - list(APPEND nvptx_options "--cuda-feature=+ptx72") - elseif(${gpu_arch} STREQUAL "sm_90") - list(APPEND nvptx_options "--cuda-feature=+ptx72") - else() - message(FATAL_ERROR "Unknown Nvidia GPU architecture '${gpu_arch}'") - endif() - - if(LIBC_CUDA_ROOT) - list(APPEND nvptx_options "--cuda-path=${LIBC_CUDA_ROOT}") - endif() - set(${output_var} ${nvptx_options} PARENT_SCOPE) -endfunction() - function(_get_hermetic_test_compile_options output_var flags) 
_get_compile_options_from_flags(compile_flags ${flags}) list(APPEND compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags} ${flags} -fpie -ffreestanding -fno-exceptions -fno-rtti) # The GPU build requires overriding the default CMake triple and architecture. - if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) list(APPEND compile_options -nogpulib -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto - --target=${LIBC_GPU_TARGET_TRIPLE} -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}) - elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) - get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE}) + elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) list(APPEND compile_options - -nogpulib ${nvptx_options} -fno-use-cxa-atexit --target=${LIBC_GPU_TARGET_TRIPLE}) + "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false" + --cuda-path=${LIBC_CUDA_ROOT} + -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit) endif() set(${output_var} ${compile_options} PARENT_SCOPE) endfunction() diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake index 9e9b598721ab3..19515b1cbcc18 100644 --- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake +++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake @@ -139,7 +139,7 @@ function(add_gen_header target_name) ${hdrgen_deps} ) - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) + if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls) set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path}) add_custom_command( diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake index 81c207ec23176..f15ffd5f9c218 100644 --- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake +++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake @@ -50,31 +50,9 @@ function(collect_object_file_deps target result) endif() endfunction(collect_object_file_deps) -# A rule to build a library from a collection of 
entrypoint objects. -# Usage: -# add_entrypoint_library( -# DEPENDS -# ) -# -# NOTE: If one wants an entrypoint to be available in a library, then they will -# have to list the entrypoint target explicitly in the DEPENDS list. Implicit -# entrypoint dependencies will not be added to the library. -function(add_entrypoint_library target_name) - cmake_parse_arguments( - "ENTRYPOINT_LIBRARY" - "" # No optional arguments - "" # No single value arguments - "DEPENDS" # Multi-value arguments - ${ARGN} - ) - if(NOT ENTRYPOINT_LIBRARY_DEPENDS) - message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list " - "of 'add_entrypoint_object' targets.") - endif() - - get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS}) +function(get_all_object_file_deps result fq_deps_list) set(all_deps "") - foreach(dep IN LISTS fq_deps_list) + foreach(dep ${fq_deps_list}) get_target_property(dep_type ${dep} "TARGET_TYPE") if(NOT ((${dep_type} STREQUAL ${ENTRYPOINT_OBJ_TARGET_TYPE}) OR (${dep_type} STREQUAL ${ENTRYPOINT_EXT_TARGET_TYPE}) OR @@ -102,6 +80,121 @@ function(add_entrypoint_library target_name) list(APPEND all_deps ${entrypoint_target}) endforeach(dep) list(REMOVE_DUPLICATES all_deps) + set(${result} ${all_deps} PARENT_SCOPE) +endfunction() + +# A rule to build a library from a collection of entrypoint objects and bundle +# it into a GPU fatbinary. Usage is the same as 'add_entrypoint_library'. 
+# Usage:
+# add_gpu_entrypoint_library(
+# DEPENDS
+# )
+function(add_gpu_entrypoint_library target_name)
+ cmake_parse_arguments(
+ "ENTRYPOINT_LIBRARY"
+ "" # No optional arguments
+ "" # No single value arguments
+ "DEPENDS" # Multi-value arguments
+ ${ARGN}
+ )
+ if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
+ message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
+ "of 'add_entrypoint_object' targets.")
+ endif()
+
+ get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
+ get_all_object_file_deps(all_deps "${fq_deps_list}")
+
+ # The GPU 'libc' needs to be exported in a format that can be linked with
+ # offloading languages like OpenMP or CUDA. This wraps every GPU object into a
+ # fat binary and adds them to a static library.
+ set(objects "")
+ foreach(dep IN LISTS all_deps)
+ set(object $<$,${dep}>:$>)
+ string(FIND ${dep} "." last_dot_loc REVERSE)
+ math(EXPR name_loc "${last_dot_loc} + 1")
+ string(SUBSTRING ${dep} ${name_loc} -1 name)
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ set(prefix --image=arch=generic,triple=nvptx64-nvidia-cuda,feature=+ptx63)
+ elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+ set(prefix --image=arch=generic,triple=amdgcn-amd-amdhsa)
+ endif()
+
+ # Use the 'clang-offload-packager' to merge these files into a binary blob.
+ add_custom_command(
+ OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin"
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/binary
+ COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
+ "${prefix},file=$" -o
+ ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin
+ DEPENDS ${dep}
+ COMMENT "Packaging LLVM offloading binary for '${object}'"
+ )
+ add_custom_target(${dep}.__gpubin__ DEPENDS ${dep}
+ "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
+
+ # CMake does not permit setting the name on object files. In order to have
+ # human readable names we create an empty stub file with the entrypoint
+ # name. This empty file will then have the created binary blob embedded.
+ add_custom_command(
+ OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs
+ COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp
+ DEPENDS ${dep} ${dep}.__gpubin__
+ )
+ add_custom_target(${dep}.__stub__
+ DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp")
+
+ add_library(${dep}.__fatbin__
+ EXCLUDE_FROM_ALL OBJECT
+ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
+ )
+
+ # This is always compiled for the LLVM host triple instead of the native GPU
+ # triple that is used by default in the build.
+ target_compile_options(${dep}.__fatbin__ BEFORE PRIVATE -nostdlib)
+ target_compile_options(${dep}.__fatbin__ PRIVATE
+ --target=${LLVM_HOST_TRIPLE}
+ "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
+ add_dependencies(${dep}.__fatbin__ ${dep} ${dep}.__stub__ ${dep}.__gpubin__)
+
+ # Set the list of newly created fat binaries containing embedded device code.
+ list(APPEND objects $)
+ endforeach()
+
+ add_library(
+ ${target_name}
+ STATIC
+ ${objects}
+ )
+ set_target_properties(${target_name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR})
+endfunction(add_gpu_entrypoint_library)
+
+# A rule to build a library from a collection of entrypoint objects.
+# Usage:
+# add_entrypoint_library(
+# DEPENDS
+# )
+#
+# NOTE: If one wants an entrypoint to be available in a library, then they will
+# have to list the entrypoint target explicitly in the DEPENDS list. Implicit
+# entrypoint dependencies will not be added to the library.
+function(add_entrypoint_library target_name) + cmake_parse_arguments( + "ENTRYPOINT_LIBRARY" + "" # No optional arguments + "" # No single value arguments + "DEPENDS" # Multi-value arguments + ${ARGN} + ) + if(NOT ENTRYPOINT_LIBRARY_DEPENDS) + message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list " + "of 'add_entrypoint_object' targets.") + endif() + + get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS}) + get_all_object_file_deps(all_deps "${fq_deps_list}") + set(objects "") foreach(dep IN LISTS all_deps) list(APPEND objects $<$,${dep}>:$>) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 308ba7d0d5dd7..78536f4eec55a 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -1,175 +1,5 @@ set(OBJECT_LIBRARY_TARGET_TYPE "OBJECT_LIBRARY") -# Build the object target for a single GPU arch. -# Usage: -# _build_gpu_object_for_single_arch( -# -# -# SRCS -# HDRS -# DEPENDS -# COMPILE_OPTIONS -# FLAGS -# ) -function(_build_gpu_object_for_single_arch fq_target_name gpu_arch) - cmake_parse_arguments( - "ADD_GPU_OBJ" - "" # No optional arguments - "NAME;CXX_STANDARD" # Single value arguments - "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" # Multi value arguments - ${ARGN} - ) - - if(NOT ADD_GPU_OBJ_CXX_STANDARD) - set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD}) - endif() - - set(compile_options ${ADD_GPU_OBJ_COMPILE_OPTIONS}) - # Derive the triple from the specified architecture. 
- if("${gpu_arch}" IN_LIST all_amdgpu_architectures) - set(gpu_target_triple ${AMDGPU_TARGET_TRIPLE}) - list(APPEND compile_options "-mcpu=${gpu_arch}") - list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none") - list(APPEND compile_options "-emit-llvm") - elseif("${gpu_arch}" IN_LIST all_nvptx_architectures) - set(gpu_target_triple ${NVPTX_TARGET_TRIPLE}) - get_nvptx_compile_options(nvptx_options ${gpu_arch}) - list(APPEND compile_options "${nvptx_options}") - else() - message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'") - endif() - list(APPEND compile_options "--target=${gpu_target_triple}") - - # Build the library for this target architecture. We always emit LLVM-IR for - # packaged GPU binaries. - add_library(${fq_target_name} - EXCLUDE_FROM_ALL - OBJECT - ${ADD_GPU_OBJ_SRCS} - ${ADD_GPU_OBJ_HDRS} - ) - - target_compile_options(${fq_target_name} PRIVATE ${compile_options}) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - set_target_properties(${fq_target_name} PROPERTIES CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD}) - if(ADD_GPU_OBJ_DEPENDS) - add_dependencies(${fq_target_name} ${ADD_GPU_OBJ_DEPENDS}) - set_target_properties(${fq_target_name} PROPERTIES DEPS "${ADD_GPU_OBJ_DEPENDS}") - endif() -endfunction(_build_gpu_object_for_single_arch) - -# Build the object target for the GPU. -# This compiles the target for all supported architectures and embeds it into -# host binary for installing. 
-# Usage: -# _build_gpu_object_bundle( -# -# SRCS -# HDRS -# DEPENDS -# COMPILE_OPTIONS -# FLAGS -# ) -function(_build_gpu_object_bundle fq_target_name) - cmake_parse_arguments( - "ADD_GPU_OBJ" - "" # No optional arguments - "NAME;CXX_STANDARD" # Single value arguments - "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" # Multi value arguments - ${ARGN} - ) - - if(NOT ADD_GPU_OBJ_CXX_STANDARD) - set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD}) - endif() - - foreach(add_gpu_obj_src ${ADD_GPU_OBJ_SRCS}) - # The packaged version will be built for every target GPU architecture. We do - # this so we can support multiple accelerators on the same machine. - foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES}) - get_filename_component(src_name ${add_gpu_obj_src} NAME) - set(gpu_target_name ${fq_target_name}.${src_name}.${gpu_arch}) - - _build_gpu_object_for_single_arch( - ${gpu_target_name} - ${gpu_arch} - CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD} - HDRS ${ADD_GPU_OBJ_HDRS} - SRCS ${add_gpu_obj_src} - COMPILE_OPTIONS - ${ADD_GPU_OBJ_COMPILE_OPTIONS} - "-emit-llvm" - DEPENDS ${ADD_GPU_OBJ_DEPENDS} - ) - # Append this target to a list of images to package into a single binary. - set(input_file $) - if("${gpu_arch}" IN_LIST all_nvptx_architectures) - get_nvptx_compile_options(nvptx_options ${gpu_arch}) - string(REGEX MATCH "\\+ptx[0-9]+" nvptx_ptx_feature ${nvptx_options}) - list(APPEND packager_images - --image=file=${input_file},arch=${gpu_arch},triple=${NVPTX_TARGET_TRIPLE},feature=${nvptx_ptx_feature}) - else() - list(APPEND packager_images - --image=file=${input_file},arch=${gpu_arch},triple=${AMDGPU_TARGET_TRIPLE}) - endif() - list(APPEND gpu_target_objects ${input_file}) - endforeach() - - # After building the target for the desired GPUs we must package the output - # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for - # more information. 
- set(packaged_target_name ${fq_target_name}.${src_name}.__gpu__) - set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.${src_name}.gpubin) - - add_custom_command(OUTPUT ${packaged_output_name} - COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER} - ${packager_images} -o ${packaged_output_name} - DEPENDS ${gpu_target_objects} ${add_gpu_obj_src} ${ADD_GPU_OBJ_HDRS} - COMMENT "Packaging LLVM offloading binary") - add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name}) - list(APPEND packaged_gpu_names ${packaged_target_name}) - list(APPEND packaged_gpu_binaries ${packaged_output_name}) - endforeach() - - # We create an empty 'stub' file for the host to contain the embedded device - # code. This will be packaged into 'libcgpu.a'. - # TODO: In the future we will want to combine every architecture for a target - # into a single bitcode file and use that. For now we simply build for - # every single one and let the offloading linker handle it. - string(FIND ${fq_target_name} "." last_dot_loc REVERSE) - math(EXPR name_loc "${last_dot_loc} + 1") - string(SUBSTRING ${fq_target_name} ${name_loc} -1 target_name) - set(stub_filename "${target_name}.cpp") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}" - COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs/ - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename} - DEPENDS ${gpu_target_objects} ${ADD_GPU_OBJ_SRCS} ${ADD_GPU_OBJ_HDRS} - ) - set(stub_target_name ${fq_target_name}.__stub__) - add_custom_target(${stub_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}) - - add_library( - ${fq_target_name} - # We want an object library as the objects will eventually get packaged into - # an archive (like libcgpu.a). 
- EXCLUDE_FROM_ALL - OBJECT - ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename} - ) - target_compile_options(${fq_target_name} BEFORE PRIVATE - ${ADD_GPU_OBJ_COMPILE_OPTIONS} -nostdlib) - foreach(packaged_gpu_binary ${packaged_gpu_binaries}) - target_compile_options(${fq_target_name} PRIVATE - "SHELL:-Xclang -fembed-offload-object=${packaged_gpu_binary}") - endforeach() - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - add_dependencies(${fq_target_name} - ${full_deps_list} ${packaged_gpu_names} ${stub_target_name}) -endfunction() - # Rule which is essentially a wrapper over add_library to compile a set of # sources to object files. # Usage: @@ -214,53 +44,37 @@ function(create_object_library fq_target_name) message(FATAL_ERROR "'add_object_library' rule requires SRCS to be specified.") endif() - # The GPU build uses a separate internal file. - if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND NOT ${ADD_OBJECT_NO_GPU_BUNDLE}) - set(internal_target_name ${fq_target_name}.__internal__) - set(public_packaging_for_internal "") - else() - set(internal_target_name ${fq_target_name}) - set(public_packaging_for_internal "-DLIBC_COPT_PUBLIC_PACKAGING") - endif() + set(internal_target_name ${fq_target_name}.__internal__) + set(public_packaging_for_internal "-DLIBC_COPT_PUBLIC_PACKAGING") _get_common_compile_options(compile_options "${ADD_OBJECT_FLAGS}") list(APPEND compile_options ${ADD_OBJECT_COMPILE_OPTIONS}) - # GPU builds require special handling for the objects because we want to - # export several different targets at once, e.g. for both Nvidia and AMD. 
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - if(NOT ${ADD_OBJECT_NO_GPU_BUNDLE}) - _build_gpu_object_bundle( - ${fq_target_name} - SRCS ${ADD_OBJECT_SRCS} - HDRS ${ADD_OBJECT_HDRS} - CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD} - COMPILE_OPTIONS ${compile_options} "-DLIBC_COPT_PUBLIC_PACKAGING" - DEPENDS ${fq_deps_list} - ) - endif() - # When the target for GPU is not bundled, internal_target_name is the same - # as fq_targetname - _build_gpu_object_for_single_arch( - ${internal_target_name} - ${LIBC_GPU_TARGET_ARCHITECTURE} - SRCS ${ADD_OBJECT_SRCS} - HDRS ${ADD_OBJECT_HDRS} - CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD} - COMPILE_OPTIONS ${compile_options} ${public_packaging_for_internal} - DEPENDS ${fq_deps_list} - ) - else() + add_library( + ${fq_target_name} + EXCLUDE_FROM_ALL + OBJECT + ${ADD_OBJECT_SRCS} + ${ADD_OBJECT_HDRS} + ) + target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_compile_options(${fq_target_name} PRIVATE ${compile_options}) + + # The NVPTX target is installed as LLVM-IR but the internal testing toolchain + # cannot handle it natively. Make a separate internal target for testing. 
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX AND NOT LIBC_GPU_TESTS_DISABLED) add_library( - ${fq_target_name} + ${internal_target_name} EXCLUDE_FROM_ALL OBJECT ${ADD_OBJECT_SRCS} ${ADD_OBJECT_HDRS} ) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - target_compile_options(${fq_target_name} PRIVATE ${compile_options}) + target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_compile_options(${internal_target_name} PRIVATE ${compile_options} + -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE}) endif() if(SHOW_INTERMEDIATE_OBJECTS) @@ -290,13 +104,18 @@ function(create_object_library fq_target_name) FLAGS "${ADD_OBJECT_FLAGS}" ) + # If we built a separate internal target we want to use those target objects + # for testing instead of the exported target. + set(target_objects ${fq_target_name}) if(TARGET ${internal_target_name}) - set_target_properties( - ${fq_target_name} - PROPERTIES - OBJECT_FILES "$" - ) + set(target_objects ${internal_target_name}) endif() + + set_target_properties( + ${fq_target_name} + PROPERTIES + OBJECT_FILES "$" + ) endfunction(create_object_library) function(add_object_library target_name) @@ -389,12 +208,19 @@ function(create_entrypoint_object fq_target_name) get_target_property(object_file ${fq_dep_name} "OBJECT_FILE") get_target_property(object_file_raw ${fq_dep_name} "OBJECT_FILE_RAW") - add_library( - ${internal_target_name} - EXCLUDE_FROM_ALL - OBJECT - ${object_file_raw} - ) + + # If the system cannot build the GPU tests we simply make a dummy target. 
+ if(LIBC_TARGET_OS_IS_GPU AND LIBC_GPU_TESTS_DISABLED) + add_custom_target(${internal_target_name}) + else() + add_library( + ${internal_target_name} + EXCLUDE_FROM_ALL + OBJECT + ${object_file_raw} + ) + endif() + add_dependencies(${internal_target_name} ${fq_dep_name}) add_library( ${fq_target_name} @@ -441,60 +267,42 @@ function(create_entrypoint_object fq_target_name) endif() endif() - # GPU builds require special handling for the objects because we want to - # export several different targets at once, e.g. for both Nvidia and AMD. - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - _build_gpu_object_bundle( - ${fq_target_name} - SRCS ${ADD_ENTRYPOINT_OBJ_SRCS} - HDRS ${ADD_ENTRYPOINT_OBJ_HDRS} - COMPILE_OPTIONS ${common_compile_options} "-DLIBC_COPT_PUBLIC_PACKAGING" - CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD} - DEPENDS ${full_deps_list} - FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}" - ) - _build_gpu_object_for_single_arch( - ${internal_target_name} - ${LIBC_GPU_TARGET_ARCHITECTURE} - SRCS ${ADD_ENTRYPOINT_OBJ_SRCS} - HDRS ${ADD_ENTRYPOINT_OBJ_HDRS} - COMPILE_OPTIONS ${common_compile_options} - CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD} - DEPENDS ${full_deps_list} - FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}" - ) - else() - add_library( - ${internal_target_name} - # TODO: We don't need an object library for internal consumption. - # A future change should switch this to a normal static library. 
- EXCLUDE_FROM_ALL - OBJECT - ${ADD_ENTRYPOINT_OBJ_SRCS} - ${ADD_ENTRYPOINT_OBJ_HDRS} - ) - target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options}) - target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - add_dependencies(${internal_target_name} ${full_deps_list}) - target_link_libraries(${internal_target_name} ${full_deps_list}) - - add_library( - ${fq_target_name} - # We want an object library as the objects will eventually get packaged into - # an archive (like libc.a). - EXCLUDE_FROM_ALL - OBJECT - ${ADD_ENTRYPOINT_OBJ_SRCS} - ${ADD_ENTRYPOINT_OBJ_HDRS} - ) - target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - add_dependencies(${fq_target_name} ${full_deps_list}) - target_link_libraries(${fq_target_name} ${full_deps_list}) + add_library( + ${internal_target_name} + # TODO: We don't need an object library for internal consumption. + # A future change should switch this to a normal static library. + EXCLUDE_FROM_ALL + OBJECT + ${ADD_ENTRYPOINT_OBJ_SRCS} + ${ADD_ENTRYPOINT_OBJ_HDRS} + ) + target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options}) + target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + add_dependencies(${internal_target_name} ${full_deps_list}) + target_link_libraries(${internal_target_name} ${full_deps_list}) + + # The NVPTX target cannot use LTO for the internal targets used for testing. 
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + target_compile_options(${internal_target_name} PRIVATE + -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE}) endif() + add_library( + ${fq_target_name} + # We want an object library as the objects will eventually get packaged into + # an archive (like libc.a). + EXCLUDE_FROM_ALL + OBJECT + ${ADD_ENTRYPOINT_OBJ_SRCS} + ${ADD_ENTRYPOINT_OBJ_HDRS} + ) + target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING) + target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + add_dependencies(${fq_target_name} ${full_deps_list}) + target_link_libraries(${fq_target_name} ${full_deps_list}) + set_target_properties( ${fq_target_name} PROPERTIES diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 6ca9516ff7a0e..373cbd6853859 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -449,7 +449,7 @@ function(add_integration_test test_name) ${fq_build_target_name} EXCLUDE_FROM_ALL # The NVIDIA 'nvlink' linker does not currently support static libraries. 
- $<$:${link_object_files}> + $<$:${link_object_files}> ${INTEGRATION_TEST_SRCS} ${INTEGRATION_TEST_HDRS} ) @@ -461,8 +461,17 @@ function(add_integration_test test_name) _get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}") target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static) + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + target_link_options(${fq_build_target_name} PRIVATE + -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto + "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static + "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") + elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # We need to use the internal object versions for NVPTX. + set(internal_suffix ".__internal__") + target_link_options(${fq_build_target_name} PRIVATE + -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static + "--cuda-path=${LIBC_CUDA_ROOT}") elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static) else() @@ -474,9 +483,10 @@ function(add_integration_test test_name) target_link_libraries( ${fq_build_target_name} # The NVIDIA 'nvlink' linker does not currently support static libraries. - $<$>:${fq_target_name}.__libc__> - libc.startup.${LIBC_TARGET_OS}.crt1 - libc.test.IntegrationTest.test) + $<$>:${fq_target_name}.__libc__> + libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} + libc.test.IntegrationTest.test${internal_suffix} + ) add_dependencies(${fq_build_target_name} libc.test.IntegrationTest.test ${INTEGRATION_TEST_DEPENDS}) @@ -495,7 +505,7 @@ function(add_integration_test test_name) # makes `add_custom_target` construct the correct command and execute it. 
set(test_cmd ${INTEGRATION_TEST_ENV} - $<$:${gpu_loader_exe}> + $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${INTEGRATION_TEST_LOADER_ARGS} $ ${INTEGRATION_TEST_ARGS}) @@ -606,7 +616,7 @@ function(add_libc_hermetic_test test_name) ${fq_build_target_name} EXCLUDE_FROM_ALL # The NVIDIA 'nvlink' linker does not currently support static libraries. - $<$:${link_object_files}> + $<$:${link_object_files}> ${HERMETIC_TEST_SRCS} ${HERMETIC_TEST_HDRS} ) @@ -615,6 +625,8 @@ function(add_libc_hermetic_test test_name) RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} #OUTPUT_NAME ${fq_target_name} ) + + _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}") target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}") @@ -629,8 +641,17 @@ function(add_libc_hermetic_test test_name) endif() endforeach() - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static) + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + target_link_options(${fq_build_target_name} PRIVATE + -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto + "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static + "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") + elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # We need to use the internal object versions for NVPTX. 
+ set(internal_suffix ".__internal__") + target_link_options(${fq_build_target_name} PRIVATE + -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static + "--cuda-path=${LIBC_CUDA_ROOT}") elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static) else() @@ -642,12 +663,12 @@ function(add_libc_hermetic_test test_name) target_link_libraries( ${fq_build_target_name} PRIVATE - libc.startup.${LIBC_TARGET_OS}.crt1 + libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} ${link_libraries} LibcTest.hermetic LibcHermeticTestSupport.hermetic # The NVIDIA 'nvlink' linker does not currently support static libraries. - $<$>:${fq_target_name}.__libc__>) + $<$>:${fq_target_name}.__libc__>) add_dependencies(${fq_build_target_name} LibcTest.hermetic libc.test.UnitTest.ErrnoSetterMatcher @@ -660,7 +681,7 @@ function(add_libc_hermetic_test test_name) endif() set(test_cmd ${HERMETIC_TEST_ENV} - $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS} + $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS} $ ${HERMETIC_TEST_ARGS}) add_custom_target( ${fq_target_name} diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake index 2086175bae6c7..75beef86760c8 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -1,23 +1,8 @@ -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) message(FATAL_ERROR "libc build: Invalid attempt to set up GPU architectures.") endif() -# Set up the target architectures to build the GPU libc for. 
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" - "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942" - "gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034" - "gfx1035;gfx1036" - "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151") -set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" - "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90") -set(all_gpu_architectures - "${all_amdgpu_architectures};${all_nvptx_architectures}") -set(LIBC_GPU_ARCHITECTURES "all" CACHE STRING - "List of GPU architectures to build the libc for.") -set(AMDGPU_TARGET_TRIPLE "amdgcn-amd-amdhsa") -set(NVPTX_TARGET_TRIPLE "nvptx64-nvidia-cuda") - # Ensure the compiler is a valid clang when building the GPU target. set(req_ver "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}") if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND @@ -31,40 +16,6 @@ if(NOT LLVM_LIBC_FULL_BUILD) "GPU.") endif() -# Identify any locally installed AMD GPUs on the system using 'amdgpu-arch'. -find_program(LIBC_AMDGPU_ARCH - NAMES amdgpu-arch NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin /opt/rocm/llvm/bin/) - -# Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'. -find_program(LIBC_NVPTX_ARCH - NAMES nvptx-arch NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin) - -# Get the list of all natively supported GPU architectures. 
-set(detected_gpu_architectures "") -foreach(arch_tool ${LIBC_NVPTX_ARCH} ${LIBC_AMDGPU_ARCH}) - if(arch_tool) - execute_process(COMMAND ${arch_tool} - OUTPUT_VARIABLE arch_tool_output - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - string(REPLACE "\n" ";" arch_list "${arch_tool_output}") - list(APPEND detected_gpu_architectures "${arch_list}") - endif() -endforeach() -list(REMOVE_DUPLICATES detected_gpu_architectures) - -if(LIBC_GPU_ARCHITECTURES STREQUAL "all") - set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures}) -elseif(LIBC_GPU_ARCHITECTURES STREQUAL "native") - if(NOT detected_gpu_architectures) - message(FATAL_ERROR "No GPUs found on the system when using 'native'") - endif() - set(LIBC_GPU_ARCHITECTURES ${detected_gpu_architectures}) -endif() -message(STATUS "Building libc for the following GPU architecture(s): " - "${LIBC_GPU_ARCHITECTURES}") - # Identify the program used to package multiple images into a single binary. find_program(LIBC_CLANG_OFFLOAD_PACKAGER NAMES clang-offload-packager NO_DEFAULT_PATH @@ -87,49 +38,54 @@ else() endif() set(LIBC_GPU_TEST_ARCHITECTURE "" CACHE STRING "Architecture for the GPU tests") +if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + check_cxx_compiler_flag("-nogpulib -mcpu=native" PLATFORM_HAS_GPU) +elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'. + # Using 'check_cxx_compiler_flag' does not work currently due to the link job. 
+ find_program(LIBC_NVPTX_ARCH + NAMES nvptx-arch NO_DEFAULT_PATH + PATHS ${LLVM_BINARY_DIR}/bin) + if(LIBC_NVPTX_ARCH) + execute_process(COMMAND ${LIBC_NVPTX_ARCH} + OUTPUT_VARIABLE arch_tool_output + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(arch_tool_output MATCHES "^sm_[0-9]+") + set(PLATFORM_HAS_GPU TRUE) + endif() + endif() +endif() set(gpu_test_architecture "") if(LIBC_GPU_TEST_ARCHITECTURE) + set(LIBC_GPU_TESTS_DISABLED FALSE) set(gpu_test_architecture ${LIBC_GPU_TEST_ARCHITECTURE}) message(STATUS "Using user-specified GPU architecture for testing: " "'${gpu_test_architecture}'") -elseif(detected_gpu_architectures) - list(GET detected_gpu_architectures 0 gpu_test_architecture) +elseif(PLATFORM_HAS_GPU) + set(LIBC_GPU_TESTS_DISABLED FALSE) + set(gpu_test_architecture "native") message(STATUS "Using GPU architecture detected on the system for testing: " - "'${gpu_test_architecture}'") + "'native'") else() - list(LENGTH LIBC_GPU_ARCHITECTURES n_gpu_archs) - if (${n_gpu_archs} EQUAL 1) - set(gpu_test_architecture ${LIBC_GPU_ARCHITECTURES}) - message(STATUS "Using user-specified GPU architecture for testing: " - "'${gpu_test_architecture}'") - else() - message(STATUS "No GPU architecture set for testing. GPU tests will not be " - "availibe. 
Set 'LIBC_GPU_TEST_ARCHITECTURE' to override.") - return() - endif() + set(LIBC_GPU_TESTS_DISABLED TRUE) + message(STATUS "No GPU architecture detected or provided, tests will not be " + "built") endif() +set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}") -if("${gpu_test_architecture}" IN_LIST all_amdgpu_architectures) - set(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU TRUE) - set(LIBC_GPU_TARGET_TRIPLE ${AMDGPU_TARGET_TRIPLE}) - set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}") -elseif("${gpu_test_architecture}" IN_LIST all_nvptx_architectures) - set(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX TRUE) - set(LIBC_GPU_TARGET_TRIPLE ${NVPTX_TARGET_TRIPLE}) - set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}") -else() - message(FATAL_ERROR "Unknown GPU architecture '${gpu_test_architecture}'") -endif() +if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # FIXME: This is a hack required to keep the CUDA package from trying to find + # pthreads. We only link the CUDA driver, so this is unneeded. + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) -if(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) find_package(CUDAToolkit QUIET) if(CUDAToolkit_FOUND) get_filename_component(LIBC_CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE) endif() endif() -if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) +if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) # The AMDGPU environment uses different code objects to encode the ABI for # kernel calls and intrinsic functions. We want to specify this manually to # conform to whatever the test suite was built to handle. diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst index 71f5e7ba20393..79b9116c38ed2 100644 --- a/libc/docs/gpu/using.rst +++ b/libc/docs/gpu/using.rst @@ -14,25 +14,25 @@ Building the GPU library LLVM's libc GPU support *must* be built with an up-to-date ``clang`` compiler due to heavy reliance on ``clang``'s GPU support. This can be done automatically -using the ``LLVM_ENABLE_RUNTIMES=libc`` option. 
To enable libc for the GPU,
-enable the ``LIBC_GPU_BUILD`` option. By default, ``libcgpu.a`` will be built
-using every supported GPU architecture. To restrict the number of architectures
-build, either set ``LIBC_GPU_ARCHITECTURES`` to the list of desired
-architectures manually or use ``native`` to detect the GPUs on your system. A
-typical ``cmake`` configuration will look like this:
+using the LLVM runtimes support. The GPU build is done using cross-compilation
+to the GPU architecture. This project currently supports AMD and NVIDIA GPUs
+which can be targeted using the appropriate target name. The following
+invocation will enable a cross-compiling build for the GPU architecture and
+enable the ``libc`` project only for them.

 .. code-block:: sh

   $> cd llvm-project # The llvm-project checkout
   $> mkdir build
   $> cd build
-  $> cmake ../llvm -G Ninja \
-     -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" \
-     -DLLVM_ENABLE_RUNTIMES="libc;openmp" \
+  $> cmake ../llvm -G Ninja \
+     -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" \
+     -DLLVM_ENABLE_RUNTIMES="openmp" \
      -DCMAKE_BUILD_TYPE= \ # Select build type
-     -DLIBC_GPU_BUILD=ON \ # Build in GPU mode
-     -DLIBC_GPU_ARCHITECTURES=all \ # Build all supported architectures
-     -DCMAKE_INSTALL_PREFIX= \ # Where 'libcgpu.a' will live
+     -DCMAKE_INSTALL_PREFIX= \ # Where 'libcgpu.a' will live
+     -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=libc \
+     -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=libc \
+     -DLLVM_RUNTIME_TARGETS="default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda"
   $> ninja install

 Since we want to include ``clang``, ``lld`` and ``compiler-rt`` in our
@@ -40,13 +40,14 @@ toolchain, we list them in ``LLVM_ENABLE_PROJECTS``. To ensure ``libc`` is built
 using a compatible compiler and to support ``openmp`` offloading, we list them
 in ``LLVM_ENABLE_RUNTIMES`` to build them after the enabled projects using the
 newly built compiler.
``CMAKE_INSTALL_PREFIX`` specifies the installation -directory in which to install the ``libcgpu.a`` library and headers along with -LLVM. The generated headers will be placed in ``include/gpu-none-llvm``. +directory in which to install the ``libcgpu-nvptx.a`` and ``libcgpu-amdgpu.a`` +libraries and headers along with LLVM. The generated headers will be placed in +``include/``. Usage ===== -Once the ``libcgpu.a`` static archive has been built it can be linked directly +Once the static archive has been built it can be linked directly with offloading applications as a standard library. This process is described in the `clang documentation `_. This linking mode is used by the OpenMP toolchain, but is currently opt-in for @@ -68,7 +69,7 @@ supported target device. The supported architectures can be seen using LLVM's OFFLOADING IMAGE [0]: kind llvm ir - arch gfx90a + arch generic triple amdgcn-amd-amdhsa producer none diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index dc3c9b8e6328a..9090b3bca01e0 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -4,7 +4,7 @@ set(LIBC_INCLUDE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(LLVMLibCHeaderRules) # The GPU build wants to install files in the compiler's resource directory. -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) include(GetClangResourceDir) endif() @@ -586,7 +586,7 @@ add_gen_header( .llvm-libc-types.wchar_t ) -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu) add_gen_header( @@ -638,7 +638,7 @@ foreach(target IN LISTS all_install_header_targets) # The GPU optionally provides the supported declarations externally so # offloading languages like CUDA and OpenMP know what is supported by libc. We # install these in the compiler's resource directory at a preset location. 
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND PACKAGE_VERSION) + if(LIBC_TARGET_OS_IS_GPU AND PACKAGE_VERSION) get_target_property(decls_file ${target} DECLS_FILE_PATH) if(NOT decls_file) continue() diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt index c1a804232c1f5..615f4270646fb 100644 --- a/libc/lib/CMakeLists.txt +++ b/libc/lib/CMakeLists.txt @@ -2,11 +2,7 @@ set(libc_archive_targets "") set(libc_archive_names "") set(libc_archive_entrypoint_lists "") if(LLVM_LIBC_FULL_BUILD) - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - list(APPEND libc_archive_names cgpu mgpu) - else() - list(APPEND libc_archive_names c m) - endif() + list(APPEND libc_archive_names c m) list(APPEND libc_archive_targets libc libm) list(APPEND libc_archive_entrypoint_lists TARGET_LIBC_ENTRYPOINTS TARGET_LIBM_ENTRYPOINTS) @@ -40,6 +36,27 @@ foreach(archive IN ZIP_LISTS endif() endif() list(APPEND added_archive_targets ${archive_1}) + + # Add the offloading version of the library for offloading languages. These + # are installed in the standard search path separate from the other libraries. + if(LIBC_TARGET_OS_IS_GPU) + set(libc_gpu_archive_target ${archive_1}gpu) + set(libc_gpu_archive_name ${archive_0}gpu-${LIBC_TARGET_ARCHITECTURE}) + + add_gpu_entrypoint_library( + ${libc_gpu_archive_target} + DEPENDS + ${${archive_2}} + ) + set_target_properties( + ${libc_gpu_archive_target} + PROPERTIES + ARCHIVE_OUTPUT_NAME ${libc_gpu_archive_name} + ) + set_target_properties(${libc_gpu_archive_target} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR}) + list(APPEND added_gpu_archive_targets ${libc_gpu_archive_target}) + endif() endforeach() install( @@ -48,6 +65,14 @@ install( COMPONENT libc ) +if(LIBC_TARGET_OS_IS_GPU) + install( + TARGETS ${added_gpu_archive_targets} + ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} + COMPONENT libc + ) +endif() + if(NOT LIBC_TARGET_OS_IS_BAREMETAL) # For now we will disable libc-startup installation for baremetal. 
The # correct way to do it would be to make a hookable startup for baremetal diff --git a/libc/src/__support/File/CMakeLists.txt b/libc/src/__support/File/CMakeLists.txt index b3e4cc4b02779..b7c0612096aa9 100644 --- a/libc/src/__support/File/CMakeLists.txt +++ b/libc/src/__support/File/CMakeLists.txt @@ -1,5 +1,5 @@ if(NOT (TARGET libc.src.__support.threads.mutex) - OR LIBC_TARGET_ARCHITECTURE_IS_GPU) + OR LIBC_TARGET_OS_IS_GPU) # Not all platforms have a mutex implementation. If mutex is unvailable, # we just skip everything about files. return() diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt index 5a899215f4b6e..d7ebd3cab7abe 100644 --- a/libc/src/__support/GPU/CMakeLists.txt +++ b/libc/src/__support/GPU/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/src/__support/OSUtil/CMakeLists.txt b/libc/src/__support/OSUtil/CMakeLists.txt index c19677582643e..ca3b3bf1263e0 100644 --- a/libc/src/__support/OSUtil/CMakeLists.txt +++ b/libc/src/__support/OSUtil/CMakeLists.txt @@ -9,7 +9,7 @@ if(NOT TARGET ${target_os_util}) endif() # The OSUtil is an object library in GPU mode. 
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_header_library( osutil HDRS diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt index b44a65b3732e9..183fc6f8683e0 100644 --- a/libc/src/__support/RPC/CMakeLists.txt +++ b/libc/src/__support/RPC/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 05ce51e8fc650..33dc1fc97c568 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -1,6 +1,9 @@ add_subdirectory(generic) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) +elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + # TODO: We should split this into 'nvptx' and 'amdgpu' for the GPU build. + add_subdirectory(${LIBC_TARGET_OS}) endif() function(add_math_entrypoint_object name) @@ -8,6 +11,7 @@ function(add_math_entrypoint_object name) # that first and return early if we are able to add an alias target for the # machine specific implementation. get_fq_target_name("${LIBC_TARGET_ARCHITECTURE}.${name}" fq_machine_specific_target_name) + get_fq_target_name("${LIBC_TARGET_OS}.${name}" fq_os_specific_target_name) if(TARGET ${fq_machine_specific_target_name}) add_entrypoint_object( ${name} @@ -16,17 +20,25 @@ function(add_math_entrypoint_object name) .${LIBC_TARGET_ARCHITECTURE}.${name} ) return() + elseif(TARGET ${fq_os_specific_target_name}) + add_entrypoint_object( + ${name} + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.${name} + ) + return() endif() # The GPU optionally depends on vendor libraries. If we emitted one of these # entrypoints it means the user requested it and we should use it instead. 
- get_fq_target_name("${LIBC_TARGET_ARCHITECTURE}.vendor.${name}" fq_vendor_specific_target_name) + get_fq_target_name("${LIBC_TARGET_OS}.vendor.${name}" fq_vendor_specific_target_name) if(TARGET ${fq_vendor_specific_target_name}) add_entrypoint_object( ${name} ALIAS DEPENDS - .${LIBC_TARGET_ARCHITECTURE}.vendor.${name} + .${LIBC_TARGET_OS}.vendor.${name} VENDOR ) return() diff --git a/libc/src/math/gpu/vendor/CMakeLists.txt b/libc/src/math/gpu/vendor/CMakeLists.txt index f699ca103b5f8..36087ade63bfc 100644 --- a/libc/src/math/gpu/vendor/CMakeLists.txt +++ b/libc/src/math/gpu/vendor/CMakeLists.txt @@ -10,7 +10,6 @@ else() "functions will be an external reference to the vendor libraries.") endif() -find_package(CUDAToolkit QUIET) if(CUDAToolkit_FOUND) set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc) if (EXISTS ${libdevice_path}) diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index 380474ce27118..bb8e41606c5df 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -22,7 +22,7 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) endif() -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/generic) endif() diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index a4d51fb9a11ee..ce08635df3145 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -316,7 +316,7 @@ if(LLVM_LIBC_INCLUDE_SCUDO) DEPENDS ${SCUDO_DEPS} ) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_entrypoint_external( calloc ) @@ -397,7 +397,7 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.abort ) -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) add_entrypoint_object( malloc ALIAS diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 6daaf1998ea7b..1c893280e8a3c 100644 
--- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -501,7 +501,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_bcmp(bcmp_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512BW) add_bcmp(bcmp_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_bcmp(bcmp) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_bcmp(bcmp) else() add_bcmp(bcmp_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -530,7 +530,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_bzero(bzero_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F) add_bzero(bzero_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_bzero(bzero) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_bzero(bzero) else() add_bzero(bzero_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -562,7 +562,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) add_memcmp(memcmp_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memcmp(memcmp) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_memcmp(memcmp) else() add_memcmp(memcmp_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -598,7 +598,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") add_memcpy(memcpy MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_memcpy(memcpy) else() add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -632,7 +632,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) add_memmove(memmove_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") add_memmove(memmove MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) 
+elseif(LIBC_TARGET_OS_IS_GPU) add_memmove(memmove) else() add_memmove(memmove_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -667,7 +667,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") add_memset(memset MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_memset(memset) else() add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt index fa7f69f19520c..6f67fa9ff44f7 100644 --- a/libc/startup/gpu/CMakeLists.txt +++ b/libc/startup/gpu/CMakeLists.txt @@ -28,33 +28,24 @@ function(add_startup_object name) ) endfunction() -if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) - add_subdirectory(amdgpu) - - add_startup_object( - crt1 - ALIAS - DEPENDS - .amdgpu.crt1 - ) -elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) - add_subdirectory(nvptx) - - add_startup_object( - crt1 - ALIAS - DEPENDS - .nvptx.crt1 - ) -else() - # Skip building the startup code if there are no supported GPUs. 
- message(STATUS "Skipping startup for gpu target, no GPUs were detected") - return() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) + add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) endif() +add_startup_object( + crt1 + ALIAS + DEPENDS + .${LIBC_TARGET_ARCHITECTURE}.crt1 +) + add_custom_target(libc-startup) set(startup_components crt1) foreach(target IN LISTS startup_components) set(fq_target_name libc.startup.gpu.${target}) add_dependencies(libc-startup ${fq_target_name}) + install(FILES $ + DESTINATION ${LIBC_INSTALL_LIBRARY_DIR} + RENAME $ + COMPONENT libc) endforeach() diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt index c9d0ee2fd0e9a..3ac104ee8ba94 100644 --- a/libc/startup/gpu/amdgpu/CMakeLists.txt +++ b/libc/startup/gpu/amdgpu/CMakeLists.txt @@ -1,6 +1,5 @@ add_startup_object( crt1 - NO_GPU_BUNDLE # Compile this file directly without special GPU handling. SRC start.cpp DEPENDS @@ -11,17 +10,5 @@ add_startup_object( COMPILE_OPTIONS -ffreestanding # To avoid compiler warnings about calling the main function. -fno-builtin - -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION} # Manually set the ABI. ) get_fq_target_name(crt1 fq_name) - -# Ensure that clang uses the correct linker for this object type. -target_link_libraries( - ${fq_name} - PUBLIC - "-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}" - "--target=${LIBC_GPU_TARGET_TRIPLE}" - "-flto" - "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" - "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}" -) diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt index 23a54516cc982..3ac104ee8ba94 100644 --- a/libc/startup/gpu/nvptx/CMakeLists.txt +++ b/libc/startup/gpu/nvptx/CMakeLists.txt @@ -1,6 +1,5 @@ add_startup_object( crt1 - NO_GPU_BUNDLE # Compile this file directly without special GPU handling. 
SRC start.cpp DEPENDS @@ -13,11 +12,3 @@ add_startup_object( -fno-builtin ) get_fq_target_name(crt1 fq_name) - -# Ensure that clang uses the correct linker for this object type. -target_link_libraries(${fq_name} - PUBLIC - "-march=${LIBC_GPU_TARGET_ARCHITECTURE}" - "--target=${LIBC_GPU_TARGET_TRIPLE}" - "--cuda-path=${LIBC_CUDA_ROOT}" -) diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt index f22f2b183aca9..745a9a04b4af8 100644 --- a/libc/test/CMakeLists.txt +++ b/libc/test/CMakeLists.txt @@ -8,9 +8,9 @@ add_custom_target(libc-long-running-tests) add_subdirectory(UnitTest) -if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND - (NOT TARGET libc.utils.gpu.loader OR NOT TARGET libc.startup.gpu.crt1)) - message(WARNING "Cannot build libc GPU tests, missing loader implementation") +if(LIBC_TARGET_OS_IS_GPU AND + (NOT TARGET libc.utils.gpu.loader OR LIBC_GPU_TESTS_DISABLED)) + message(WARNING "Cannot build libc GPU tests, missing loader or architecture") return() endif() diff --git a/libc/test/IntegrationTest/CMakeLists.txt b/libc/test/IntegrationTest/CMakeLists.txt index dca4c5a6f1b14..4f31f10b29f0b 100644 --- a/libc/test/IntegrationTest/CMakeLists.txt +++ b/libc/test/IntegrationTest/CMakeLists.txt @@ -1,21 +1,5 @@ -if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) - set(TEST_COMPILE_FLAGS - -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} - -emit-llvm # AMDGPU's intermediate object file format is bitcode. - --target=${LIBC_GPU_TARGET_TRIPLE} - -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION} # Manually set the ABI. - ) -elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) - set(TEST_COMPILE_FLAGS - -march=${LIBC_GPU_TARGET_ARCHITECTURE} - --target=${LIBC_GPU_TARGET_TRIPLE} - --cuda-path=${LIBC_CUDA_ROOT} - ) -endif() - add_object_library( test - NO_GPU_BUNDLE # Compile this file directly without special GPU handling. 
SRCS test.cpp COMPILE_OPTIONS diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt index 4a615d4bd5e1c..4668f0061975f 100644 --- a/libc/test/UnitTest/CMakeLists.txt +++ b/libc/test/UnitTest/CMakeLists.txt @@ -12,7 +12,7 @@ function(add_unittest_framework_library name) endif() # The Nvidia 'nvlink' linker does not support static libraries. - if(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) + if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) set(library_type OBJECT) else() set(library_type STATIC) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 9801621e6b399..53fa1323d18b7 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -1,7 +1,7 @@ add_custom_target(libc-support-tests) # FIXME: These tests are currently broken on the GPU. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_libc_test( blockstore_test SUITE @@ -76,7 +76,7 @@ add_libc_test( ) # The GPU does not support varargs currently. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_libc_test( arg_list_test SUITE @@ -88,8 +88,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) ) endif() -# FIXME: Crash in NVPTX target lowering for calls -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( uint_test SUITE @@ -159,29 +158,33 @@ add_libc_test( libc.src.__support.memory_size ) -add_executable( - libc_str_to_float_comparison_test - str_to_float_comparison_test.cpp -) +# FIXME: We shouldn't have regular executables created because we could be +# cross-compiling the tests and running through an emulator. 
+if(NOT LIBC_TARGET_OS_IS_GPU) + add_executable( + libc_str_to_float_comparison_test + str_to_float_comparison_test.cpp + ) -target_link_libraries(libc_str_to_float_comparison_test - PRIVATE - "${LIBC_TARGET}" -) + target_link_libraries(libc_str_to_float_comparison_test + PRIVATE + "${LIBC_TARGET}" + ) -add_executable( - libc_system_str_to_float_comparison_test - str_to_float_comparison_test.cpp -) + add_executable( + libc_system_str_to_float_comparison_test + str_to_float_comparison_test.cpp + ) -set(float_test_file ${CMAKE_CURRENT_SOURCE_DIR}/str_to_float_comparison_data.txt) + set(float_test_file ${CMAKE_CURRENT_SOURCE_DIR}/str_to_float_comparison_data.txt) -add_custom_command(TARGET libc_str_to_float_comparison_test - POST_BUILD - COMMAND $ ${float_test_file} - DEPENDS ${float_test_file} - COMMENT "Test the strtof and strtod implementations against precomputed results." - VERBATIM) + add_custom_command(TARGET libc_str_to_float_comparison_test + POST_BUILD + COMMAND $ ${float_test_file} + DEPENDS ${float_test_file} + COMMENT "Test the strtof and strtod implementations against precomputed results." 
+ VERBATIM) +endif() add_subdirectory(CPP) add_subdirectory(File) diff --git a/libc/test/src/__support/CPP/CMakeLists.txt b/libc/test/src/__support/CPP/CMakeLists.txt index 6927579289bc2..d7f332f5b0fbd 100644 --- a/libc/test/src/__support/CPP/CMakeLists.txt +++ b/libc/test/src/__support/CPP/CMakeLists.txt @@ -64,7 +64,7 @@ add_libc_test( # This test fails with invalid address space operations on sm_60 -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( atomic_test SUITE diff --git a/libc/test/src/__support/File/CMakeLists.txt b/libc/test/src/__support/File/CMakeLists.txt index f193480c60c2b..9191469b4927c 100644 --- a/libc/test/src/__support/File/CMakeLists.txt +++ b/libc/test/src/__support/File/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT (TARGET libc.src.__support.threads.mutex) OR LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT (TARGET libc.src.__support.threads.mutex) OR LIBC_TARGET_OS_IS_GPU) # Not all platforms have a mutex implementation. If mutex is unvailable, # we just skip everything about files. The GPU does not currently support # files as well. diff --git a/libc/test/src/errno/CMakeLists.txt b/libc/test/src/errno/CMakeLists.txt index 633d46a1f5f88..b73962fb4de4d 100644 --- a/libc/test/src/errno/CMakeLists.txt +++ b/libc/test/src/errno/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LLVM_LIBC_FULL_BUILD OR LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LLVM_LIBC_FULL_BUILD OR LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 8c105515e3525..81d2e1e55b552 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1,10 +1,14 @@ add_custom_target(libc-math-unittests) -add_library( - libc_math_test_utils - RandUtils.cpp - RandUtils.h -) +# FIXME: We shouldn't have regular libraries created because we could be +# cross-compiling the tests and running through an emulator. 
+if(NOT LIBC_TARGET_OS_IS_GPU) + add_library( + libc_math_test_utils + RandUtils.cpp + RandUtils.h + ) +endif() add_fp_unittest( cosf_test @@ -755,7 +759,7 @@ add_fp_unittest( ) # FIXME: These tests are currently broken for NVPTX. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( ilogb_test SUITE @@ -986,7 +990,7 @@ add_fp_unittest( ) # FIXME: These tests are currently broken on the GPU. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_fp_unittest( fminf_test SUITE @@ -1231,7 +1235,7 @@ add_fp_unittest( ) # FIXME: These tests are currently spurious for NVPTX. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( nextafter_test SUITE diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 1824c672cb974..2d24b5a76b013 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -819,7 +819,7 @@ add_fp_unittest( ) # FIXME: These tests are currently broken for NVPTX. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( ilogb_test SUITE @@ -1073,7 +1073,7 @@ add_fp_unittest( ) # FIXME: These tests are currently broken on the GPU. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_fp_unittest( fminf_test SUITE @@ -1417,7 +1417,7 @@ add_fp_unittest( ) # FIXME: These tests are currently spurious for NVPTX. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( nextafter_test SUITE @@ -1465,7 +1465,7 @@ add_fp_unittest( ) # FIXME: These tests are currently spurious for the GPU. 
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_fp_unittest( nexttoward_test SUITE diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 8db2293ab74a9..93c21aa994ef4 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -430,7 +430,7 @@ add_libc_test( # Create an output directory for any temporary test files. file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/testdata) -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index da07dbbe79772..5826cfe8d4ca3 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -55,7 +55,7 @@ add_libc_test( ) # This fails on NVPTX where the output value is one-off of the expected value. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( strtod_test SUITE @@ -127,7 +127,7 @@ add_libc_test( ) # This fails on NVPTX where the output value is one-off of the expected value. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( strtold_test SUITE @@ -339,7 +339,7 @@ if(LLVM_LIBC_FULL_BUILD) ) # Only the GPU has an in-tree 'malloc' implementation. 
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU) + if(LIBC_TARGET_OS_IS_GPU) add_libc_test( malloc_test HERMETIC_TEST_ONLY diff --git a/libc/test/utils/UnitTest/CMakeLists.txt b/libc/test/utils/UnitTest/CMakeLists.txt index 6f61e0ffefb00..3b917e06cde21 100644 --- a/libc/test/utils/UnitTest/CMakeLists.txt +++ b/libc/test/utils/UnitTest/CMakeLists.txt @@ -1,4 +1,4 @@ -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/utils/CMakeLists.txt b/libc/utils/CMakeLists.txt index 9754dcf3854aa..7bf02a4af7dea 100644 --- a/libc/utils/CMakeLists.txt +++ b/libc/utils/CMakeLists.txt @@ -1,6 +1,6 @@ if(LLVM_INCLUDE_TESTS) add_subdirectory(MPFRWrapper) endif() -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) add_subdirectory(gpu) endif() diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt index adc073c9a91f5..6f44ca0d786c8 100644 --- a/libc/utils/MPFRWrapper/CMakeLists.txt +++ b/libc/utils/MPFRWrapper/CMakeLists.txt @@ -24,6 +24,6 @@ if(LIBC_TESTS_CAN_USE_MPFR) target_link_directories(libcMPFRWrapper PUBLIC ${LLVM_LIBC_MPFR_INSTALL_PATH}/lib) endif() target_link_libraries(libcMPFRWrapper PUBLIC LibcFPTestHelpers.unit LibcTest.unit mpfr gmp) -elseif(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(NOT LIBC_TARGET_OS_IS_GPU) message(WARNING "Math tests using MPFR will be skipped.") endif() diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt index 7c15f36052cf3..4d1ebcfb9f8e6 100644 --- a/libc/utils/gpu/CMakeLists.txt +++ b/libc/utils/gpu/CMakeLists.txt @@ -1,2 +1,4 @@ add_subdirectory(server) -add_subdirectory(loader) +if(LIBC_TARGET_OS_IS_GPU) + add_subdirectory(loader) +endif() diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt index f195b887c9af6..189460bb02e6e 100644 --- a/libc/utils/gpu/loader/CMakeLists.txt +++ b/libc/utils/gpu/loader/CMakeLists.txt @@ -1,31 +1,30 @@ add_library(gpu_loader OBJECT Main.cpp) + 
target_include_directories(gpu_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include ${LIBC_SOURCE_DIR} ) +# This utility needs to be compiled for the host system when cross compiling. +if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE) + target_compile_options(gpu_loader PUBLIC --target=${LLVM_HOST_TRIPLE}) + target_link_libraries(gpu_loader PUBLIC "--target=${LLVM_HOST_TRIPLE}") +endif() + find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) -if(hsa-runtime64_FOUND) +if(hsa-runtime64_FOUND AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) add_subdirectory(amdgpu) -else() +elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) message(STATUS "Skipping HSA loader for gpu target, no HSA was detected") endif() -find_package(CUDAToolkit QUIET) # The CUDA loader requires LLVM to traverse the ELF image for symbols. find_package(LLVM QUIET) -if(CUDAToolkit_FOUND AND LLVM_FOUND AND - "${CUDAToolkit_VERSION}" VERSION_GREATER_EQUAL "11.2") +if(CUDAToolkit_FOUND AND LLVM_FOUND AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_subdirectory(nvptx) -else() - if("${CUDAToolkit_VERSION}" VERSION_LESS "11.2") - message(WARNING - "Skipping CUDA loader for gpu target, CUDA must be version 11.2 or later. - Found CUDA Version ${CUDAToolkit_VERSION}") - else() - message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected") - endif() +elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected") endif() # Add a custom target to be used for testing. 
@@ -37,20 +36,31 @@ if(LIBC_GPU_LOADER_EXECUTABLE) PROPERTIES EXECUTABLE "${LIBC_GPU_LOADER_EXECUTABLE}" ) -elseif(TARGET amdhsa_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) +elseif(TARGET amdhsa-loader AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) add_custom_target(libc.utils.gpu.loader) - add_dependencies(libc.utils.gpu.loader amdhsa_loader) + add_dependencies(libc.utils.gpu.loader amdhsa-loader) set_target_properties( libc.utils.gpu.loader PROPERTIES - EXECUTABLE "$" + TARGET amdhsa-loader + EXECUTABLE "$" ) -elseif(TARGET nvptx_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +elseif(TARGET nvptx-loader AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_custom_target(libc.utils.gpu.loader) - add_dependencies(libc.utils.gpu.loader nvptx_loader) + add_dependencies(libc.utils.gpu.loader nvptx-loader) set_target_properties( libc.utils.gpu.loader PROPERTIES - EXECUTABLE "$" + TARGET nvptx-loader + EXECUTABLE "$" ) endif() + +if(TARGET libc.utils.gpu.loader) + get_target_property(gpu_loader_tgt libc.utils.gpu.loader "TARGET") + if(gpu_loader_tgt) + install(TARGETS ${gpu_loader_tgt} + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT libc) + endif() +endif() diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt index 8e9c9a2bdc7d2..b99319f504011 100644 --- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt +++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt @@ -1,7 +1,7 @@ -add_executable(amdhsa_loader Loader.cpp) -add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc) +add_executable(amdhsa-loader Loader.cpp) +add_dependencies(amdhsa-loader libc.src.__support.RPC.rpc) -target_link_libraries(amdhsa_loader +target_link_libraries(amdhsa-loader PRIVATE hsa-runtime64::hsa-runtime64 gpu_loader diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt index 0c76c49fa3098..e76362a1e8cca 100644 --- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt +++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt @@ 
-1,11 +1,11 @@ -add_executable(nvptx_loader Loader.cpp) -add_dependencies(nvptx_loader libc.src.__support.RPC.rpc) +add_executable(nvptx-loader Loader.cpp) +add_dependencies(nvptx-loader libc.src.__support.RPC.rpc) if(NOT LLVM_ENABLE_RTTI) - target_compile_options(nvptx_loader PRIVATE -fno-rtti) + target_compile_options(nvptx-loader PRIVATE -fno-rtti) endif() -target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS}) -target_link_libraries(nvptx_loader +target_include_directories(nvptx-loader PRIVATE ${LLVM_INCLUDE_DIRS}) +target_link_libraries(nvptx-loader PRIVATE gpu_loader llvmlibc_rpc_server diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt index 3d9b2bcab4dbc..94cdfe5bf6521 100644 --- a/libc/utils/gpu/server/CMakeLists.txt +++ b/libc/utils/gpu/server/CMakeLists.txt @@ -5,12 +5,21 @@ target_include_directories(llvmlibc_rpc_server PRIVATE ${LIBC_SOURCE_DIR}) target_include_directories(llvmlibc_rpc_server PUBLIC ${LIBC_SOURCE_DIR}/include) target_include_directories(llvmlibc_rpc_server PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + # Ignore unsupported clang attributes if we're using GCC. target_compile_options(llvmlibc_rpc_server PUBLIC $<$:-Wno-attributes>) target_compile_definitions(llvmlibc_rpc_server PUBLIC LIBC_NAMESPACE=${LIBC_NAMESPACE}) +# This utility needs to be compiled for the host system when cross compiling. +if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE) + target_compile_options(llvmlibc_rpc_server PUBLIC + --target=${LLVM_HOST_TRIPLE}) + target_link_libraries(llvmlibc_rpc_server PUBLIC + "--target=${LLVM_HOST_TRIPLE}") +endif() + # Install the server and associated header. 
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/rpc_server.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm/ diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index dbd5fbf226bd5..f5f7d3f3253fd 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -175,7 +175,9 @@ else() foreach(_name ${LLVM_RUNTIME_TARGETS}) if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES) set(NEED_LIBC_HDRGEN TRUE) - break() + if("${_name}" STREQUAL "amdgcn-amd-amdhsa" OR "${_name}" STREQUAL "nvptx64-nvidia-cuda") + set(LLVM_LIBC_GPU_BUILD ON) + endif() endif() endforeach() endif() diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 486df22c2c1bb..4257083e53ad4 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -120,6 +120,13 @@ if( LLVM_ENABLE_ASSERTIONS ) endif() endif() +# If we are targeting a GPU architecture we want to ignore all the standard +# flag handling. +if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR + "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx64") + return() +endif() + if(LLVM_ENABLE_EXPENSIVE_CHECKS) add_compile_definitions(EXPENSIVE_CHECKS) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 8c48d85a4346f..9b5e758b6ede5 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -199,7 +199,7 @@ foreach(entry ${runtimes}) list(APPEND prefixes "LLVM_LIBC") list(APPEND prefixes "LIBC_") # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode. 
- if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) + if(LLVM_LIBC_GPU_BUILD) list(APPEND prefixes "CUDA") endif() endif() @@ -424,7 +424,7 @@ if(runtimes) endforeach() endif() if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND - (LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)) + (LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD)) if(LIBC_HDRGEN_EXE) set(hdrgen_exe ${LIBC_HDRGEN_EXE}) else() @@ -441,7 +441,12 @@ if(runtimes) set(libc_cmake_args "-DLIBC_HDRGEN_EXE=${hdrgen_exe}" "-DLLVM_LIBC_FULL_BUILD=ON") list(APPEND extra_deps ${hdrgen_deps}) - if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) + if(LLVM_LIBC_GPU_BUILD) + list(APPEND libc_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON") + # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode. + if(CUDAToolkit_ROOT) + list(APPEND libc_cmake_args "-DCUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + endif() foreach(dep clang-offload-packager nvptx-arch amdgpu-arch) if(TARGET ${dep}) list(APPEND extra_deps ${dep}) diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt index 17e61d0bc47dc..a74eff0c0bebf 100644 --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -119,14 +119,7 @@ endif() pythonize_bool(LIBOMPTARGET_OMPT_SUPPORT) -# Check if this build supports the GPU libc. 
-set(LIBC_GPU_SUPPORT FALSE) -if("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND (LIBC_GPU_BUILD OR - LIBC_GPU_ARCHITECTURES)) - set(LIBC_GPU_SUPPORT TRUE) -endif() - -set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LIBC_GPU_SUPPORT} CACHE BOOL +set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LLVM_LIBC_GPU_BUILD} CACHE BOOL "Libomptarget support for the GPU libc") pythonize_bool(LIBOMPTARGET_GPU_LIBC_SUPPORT) diff --git a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt index 8ae3ff2a6d291..085d443071650 100644 --- a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt @@ -73,8 +73,12 @@ elseif(${LIBOMPTARGET_GPU_LIBC_SUPPORT}) find_library(llvmlibc_rpc_server NAMES llvmlibc_rpc_server PATHS ${LIBOMPTARGET_LLVM_LIBRARY_DIR} NO_DEFAULT_PATH) if(llvmlibc_rpc_server) - target_link_libraries(PluginCommon PRIVATE llvmlibc_rpc_server) + target_link_libraries(PluginCommon PRIVATE ${llvmlibc_rpc_server}) target_compile_definitions(PluginCommon PRIVATE LIBOMPTARGET_RPC_SUPPORT) + # We may need to get the headers directly from the 'libc' source directory. + target_include_directories(PluginCommon PRIVATE + ${CMAKE_SOURCE_DIR}/../libc/utils/gpu/server + ${CMAKE_SOURCE_DIR}/../libc/include) endif() endif() diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp index 54aced11b31c3..cb6a5086bc4dd 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp @@ -18,7 +18,8 @@ #if __has_include() #include #elif defined(LIBOMPTARGET_RPC_SUPPORT) -#include +// Just pull this out of the source if available. 
+#include "rpc_server.h" #endif using namespace llvm; diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 565556e64ff29..6c590603079c4 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -180,8 +180,12 @@ def remove_suffix_if_present(name): def add_libraries(source): if config.libomptarget_has_libc: - return source + " " + config.llvm_library_dir + "/libcgpu.a " + \ - config.llvm_library_intdir + "/libomptarget.devicertl.a" + if config.libomptarget_current_target.startswith('nvptx'): + return source + " " + config.llvm_library_dir + "/libcgpu-nvptx.a " + \ + config.llvm_library_intdir + "/libomptarget.devicertl.a" + elif config.libomptarget_current_target.startswith('amdgcn'): + return source + " " + config.llvm_library_dir + "/libcgpu-amdgpu.a " + \ + config.llvm_library_intdir + "/libomptarget.devicertl.a" return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a" # substitutions From 3ed4b95bcf2039e7293f45e3b3fdf26b81dc319f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 15:37:33 -0600 Subject: [PATCH 276/351] [Flang] Fix test not updated after 'clang' case change Summary: The shared 'clang' code changed this slightly but did not update the flang test. --- flang/test/Driver/omp-driver-offload.f90 | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90 index b45ed70195fb4..23c2a121a5afa 100644 --- a/flang/test/Driver/omp-driver-offload.f90 +++ b/flang/test/Driver/omp-driver-offload.f90 @@ -172,13 +172,25 @@ ! Check that `-gpulibc` includes the LLVM C libraries for the GPU. ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ -! RUN: --offload-arch=gfx90a --offload-arch=sm_52 \ +! RUN: --offload-arch=sm_52 \ ! RUN: -gpulibc %s 2>&1 \ -! RUN: | FileCheck --check-prefix=LIBC-GPU %s -! LIBC-GPU: "-lcgpu"{{.*}}"-lmgpu" +! 
RUN: | FileCheck --check-prefix=LIBC-GPU-NVPTX %s +! LIBC-GPU-NVPTX: "-lcgpu-nvptx"{{.*}}"-lmgpu-nvptx" ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ -! RUN: --offload-arch=gfx90a --offload-arch=sm_52 \ +! RUN: --offload-arch=sm_52 \ ! RUN: -nogpulibc %s 2>&1 \ -! RUN: | FileCheck --check-prefix=NO-LIBC-GPU %s -! NO-LIBC-GPU-NOT: "-lcgpu"{{.*}}"-lmgpu" +! RUN: | FileCheck --check-prefix=NO-LIBC-GPU-NVPTX %s +! NO-LIBC-GPU-NVPTX-NOT: "-lcgpu-nvptx"{{.*}}"-lmgpu-nvptx" + +! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ +! RUN: --offload-arch=gfx90a \ +! RUN: -gpulibc %s 2>&1 \ +! RUN: | FileCheck --check-prefix=LIBC-GPU-AMDGPU %s +! LIBC-GPU-AMDGPU: "-lcgpu-amdgpu"{{.*}}"-lmgpu-amdgpu" + +! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ +! RUN: --offload-arch=gfx90a \ +! RUN: -nogpulibc %s 2>&1 \ +! RUN: | FileCheck --check-prefix=NO-LIBC-GPU-AMDGPU %s +! NO-LIBC-GPU-AMDGPU-NOT: "-lcgpu-amdgpu"{{.*}}"-lmgpu-amdgpu" From 72763521c34287bce68402eb2a9d71dcb4eed5a0 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 22 Feb 2024 22:48:47 +0100 Subject: [PATCH 277/351] [LSR] Clear SCEVExpander before calling DeleteDeadPHIs To avoid an assertion failure when an AssertingVH is removed, as reported in: https://github.com/llvm/llvm-project/pull/82362#issuecomment-1960067147 Also remove an unnecessary use of SCEVExpanderCleaner. 
--- .../Transforms/Scalar/LoopStrengthReduce.cpp | 4 +- .../RISCV/term-fold-crash.ll | 43 +++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 627c863f7091f..08021f3ba853e 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -7033,7 +7033,6 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // SCEVExpander for both use in preheader and latch const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); SCEVExpander Expander(SE, DL, "lsr_fold_term_cond"); - SCEVExpanderCleaner ExpCleaner(Expander); assert(Expander.isSafeToExpand(TermValueS) && "Terminating value was checked safe in canFoldTerminatingCondition"); @@ -7064,10 +7063,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, BI->setCondition(NewTermCond); + Expander.clear(); OldTermCond->eraseFromParent(); DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); - - ExpCleaner.markResultUsed(); } } diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll new file mode 100644 index 0000000000000..8ca7f0010bbbe --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=loop-reduce -mtriple=riscv64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(ptr %p, i8 %arg, i32 %start) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[P:%.*]], i8 [[ARG:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[ARG]] to i32 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[CONV]], 1 +; CHECK-NEXT: 
[[TMP0:%.*]] = add i32 [[START]], [[SHR]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 1 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ADD810:%.*]] = phi i32 [ [[START]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[ADD810]] to i64 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[P]], i64 [[IDXPROM2]] +; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[ADD]] = add i32 [[ADD810]], 1 +; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq i32 [[ADD]], [[TMP1]] +; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %conv = zext i8 %arg to i32 + %shr = lshr i32 %conv, 1 + %wide.trip.count = zext nneg i32 %shr to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %add810 = phi i32 [ %start, %entry ], [ %add, %for.body ] + %idxprom2 = zext i32 %add810 to i64 + %arrayidx3 = getelementptr i8, ptr %p, i64 %idxprom2 + %v = load i8, ptr %arrayidx3, align 1 + %add = add i32 %add810, 1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv, %wide.trip.count + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} From d4bfca3b2e673789f7c278d46a199ae8910ddd37 Mon Sep 17 00:00:00 2001 From: Wentao Zhang <35722712+whentojump@users.noreply.github.com> Date: Thu, 22 Feb 2024 16:04:25 -0600 Subject: [PATCH 278/351] [clang][CodeGen] Keep processing the rest of AST after encountering unsupported MC/DC expressions (#82464) Currently, upon seeing unsupported decisions (more than 6 conditions, or split nesting), the post-visitor hook dataTraverseStmtPost() returns a false. As a result, in the rest of tree even supported decisions will be skipped as well. 
Like in the below code: { // CompoundStmt a && b; // 1: BinaryOperator (supported) a && foo(b && c); // 2: BinaryOperator (not yet supported due to split // nesting) a && b; // 3: BinaryOperator (supported) } Decision 3 will not be processed at all. And only one "Decision" region will be emitted. Compiler explorer example: https://godbolt.org/z/Px61sesoo We hope to process such cases and emit two "Decision" regions (1 and 3) in the above example. --- clang/lib/CodeGen/CodeGenPGO.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 48c5e68a3b7ba..1ef7be3c72593 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -239,9 +239,12 @@ struct MapRegionCounters : public RecursiveASTVisitor { if (MCDCMaxCond == 0) return true; - /// At the top of the logical operator nest, reset the number of conditions. - if (LogOpStack.empty()) + /// At the top of the logical operator nest, reset the number of conditions, + /// also forget previously seen split nesting cases. + if (LogOpStack.empty()) { NumCond = 0; + SplitNestedLogicalOp = false; + } if (const Expr *E = dyn_cast(S)) { const BinaryOperator *BinOp = dyn_cast(E->IgnoreParens()); @@ -292,7 +295,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { "contains an operation with a nested boolean expression. " "Expression will not be covered"); Diag.Report(S->getBeginLoc(), DiagID); - return false; + return true; } /// Was the maximum number of conditions encountered? @@ -303,7 +306,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { "number of conditions (%0) exceeds max (%1). 
" "Expression will not be covered"); Diag.Report(S->getBeginLoc(), DiagID) << NumCond << MCDCMaxCond; - return false; + return true; } // Otherwise, allocate the number of bytes required for the bitmap From ae3e14276b7181ae51e9ef731f44f813a1a3f123 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Thu, 22 Feb 2024 22:04:17 +0000 Subject: [PATCH 279/351] Fix test/Dialect/Vector/vector-transfer-flatten.mlir --- mlir/test/Dialect/Vector/vector-transfer-flatten.mlir | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 3b6441d0c9560..2766e782a3fb2 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -475,6 +475,8 @@ func.func @regression_non_contiguous_dim_read(%subview : memref<1x3x3x2xf32, str // CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>> // CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() +// CHECK-128B-LABEL: func @regression_non_contiguous_dim_read( + // ----- func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, @@ -487,3 +489,5 @@ func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, // CHECK-LABEL: func.func @unsupported_non_contiguous_dim_write( // CHECK-NOT: memref.collapse_shape + +// CHECK-128B-LABEL: func @unsupported_non_contiguous_dim_write( From e2f08268304dc972440391c43bf1d47e28fad93e Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 22 Feb 2024 14:11:10 -0800 Subject: [PATCH 280/351] [MLIR] Fix LLVM dialect specification to use AnySignlessInteger instead of AnyInteger (#82694) LLVM IR does not support signed integer, the LLVM dialect was underspecified (likely unintentionally) and the AnyInteger constraint was overly lax. 
The arithmetic dialect is already consistently using AnySignlessInteger. --- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 46 ++++++++++----------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index d9b130bdf18cb..3da5deeb4ec7e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -49,7 +49,7 @@ class LLVM_ArithmeticOpBase traits = []> : - LLVM_ArithmeticOpBase { + LLVM_ArithmeticOpBase { let arguments = commonArgs; string mlirBuilder = [{ $res = $_builder.create<$_qualCppClassName>($_location, $lhs, $rhs); @@ -57,7 +57,7 @@ class LLVM_IntArithmeticOp traits = []> : - LLVM_ArithmeticOpBase], traits)> { dag iofArg = ( ins DefaultValuedAttr:$overflowFlags); @@ -143,9 +143,9 @@ class LLVM_ArithmeticCmpOp traits = []> : // Other integer operations. def LLVM_ICmpOp : LLVM_ArithmeticCmpOp<"icmp", [Pure]> { let arguments = (ins ICmpPredicate:$predicate, - AnyTypeOf<[LLVM_ScalarOrVectorOf, + AnyTypeOf<[LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf]>:$lhs, - AnyTypeOf<[LLVM_ScalarOrVectorOf, + AnyTypeOf<[LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf]>:$rhs); let hasCustomAssemblyFormat = 1; string llvmInstName = "ICmp"; @@ -204,7 +204,7 @@ def LLVM_AllocaOp : LLVM_Op<"alloca", DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, LLVM_MemOpPatterns { - let arguments = (ins AnyInteger:$arraySize, + let arguments = (ins AnySignlessInteger:$arraySize, OptionalAttr:$alignment, TypeAttr:$elem_type, UnitAttr:$inalloca); @@ -250,7 +250,7 @@ def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let arguments = (ins LLVM_ScalarOrVectorOf:$base, - Variadic>:$dynamicIndices, + Variadic>:$dynamicIndices, DenseI32ArrayAttr:$rawConstantIndices, TypeAttr:$elem_type, UnitAttr:$inbounds); @@ -499,37 +499,37 @@ def LLVM_AddrSpaceCastOp : LLVM_CastOp<"addrspacecast", 
"AddrSpaceCast", let hasFolder = 1; } def LLVM_IntToPtrOp : LLVM_CastOp<"inttoptr", "IntToPtr", - LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; def LLVM_PtrToIntOp : LLVM_CastOp<"ptrtoint", "PtrToInt", LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf>; + LLVM_ScalarOrVectorOf>; def LLVM_SExtOp : LLVM_CastOp<"sext", "SExt", - LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf> { + LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf> { let hasVerifier = 1; } def LLVM_ZExtOp : LLVM_CastOp<"zext", "ZExt", - LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf> { + LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf> { let hasFolder = 1; let hasVerifier = 1; } def LLVM_TruncOp : LLVM_CastOp<"trunc", "Trunc", - LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf>; + LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf>; def LLVM_SIToFPOp : LLVM_CastOp<"sitofp", "SIToFP", - LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; def LLVM_UIToFPOp : LLVM_CastOp<"uitofp", "UIToFP", - LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; def LLVM_FPToSIOp : LLVM_CastOp<"fptosi", "FPToSI", LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf>; + LLVM_ScalarOrVectorOf>; def LLVM_FPToUIOp : LLVM_CastOp<"fptoui", "FPToUI", LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf>; + LLVM_ScalarOrVectorOf>; def LLVM_FPExtOp : LLVM_CastOp<"fpext", "FPExt", LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; @@ -671,7 +671,7 @@ def LLVM_ExtractElementOp : LLVM_Op<"extractelement", [Pure, "LLVM::getVectorElementType($_self)">]> { let summary = "Extract an element from an LLVM vector."; - let arguments = (ins LLVM_AnyVector:$vector, AnyInteger:$position); + let arguments = (ins LLVM_AnyVector:$vector, AnySignlessInteger:$position); let results = (outs LLVM_Type:$res); let assemblyFormat = [{ @@ -733,7 +733,7 @@ def LLVM_InsertElementOp : LLVM_Op<"insertelement", [Pure, let summary = "Insert an element into an LLVM vector."; let arguments = (ins LLVM_AnyVector:$vector, 
LLVM_PrimitiveType:$value, - AnyInteger:$position); + AnySignlessInteger:$position); let results = (outs LLVM_AnyVector:$res); let builders = [LLVM_OneResultOpBuilder]; @@ -971,7 +971,7 @@ def LLVM_SwitchOp : LLVM_TerminatorOp<"switch", DeclareOpInterfaceMethods, Pure]> { let arguments = (ins - AnyInteger:$value, + AnySignlessInteger:$value, Variadic:$defaultOperands, VariadicOfVariadic:$caseOperands, OptionalAttr:$case_values, @@ -1647,7 +1647,7 @@ def LLVM_ConstantOp // Atomic operations. // -def LLVM_AtomicRMWType : AnyTypeOf<[LLVM_AnyFloat, LLVM_AnyPointer, AnyInteger]>; +def LLVM_AtomicRMWType : AnyTypeOf<[LLVM_AnyFloat, LLVM_AnyPointer, AnySignlessInteger]>; def LLVM_AtomicRMWOp : LLVM_MemAccessOpBase<"atomicrmw", [ TypesMatchWith<"result #0 and operand #1 have the same type", @@ -1696,7 +1696,7 @@ def LLVM_AtomicRMWOp : LLVM_MemAccessOpBase<"atomicrmw", [ let hasVerifier = 1; } -def LLVM_AtomicCmpXchgType : AnyTypeOf<[AnyInteger, LLVM_AnyPointer]>; +def LLVM_AtomicCmpXchgType : AnyTypeOf<[AnySignlessInteger, LLVM_AnyPointer]>; def LLVM_AtomicCmpXchgOp : LLVM_MemAccessOpBase<"cmpxchg", [ TypesMatchWith<"operand #1 and operand #2 have the same type", From e314622f204a01ffeda59cbe046dd403b01f8b74 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Thu, 22 Feb 2024 14:26:11 -0800 Subject: [PATCH 281/351] [clang][driver] Allow unaligned access on ARMv7 and higher by default (#82400) ARM's Clang and GCC embedded compilers default to allowing unaligned access for ARMv7+. This patch changes the Clang driver default to match. Users can opt out with `-mno-unaligned-access`. 
Fixes #59560 --- clang/docs/ReleaseNotes.rst | 11 +++++++++++ clang/lib/Driver/ToolChains/Arch/ARM.cpp | 24 ++++++++++++------------ clang/test/Driver/arm-alignment.c | 15 +++++++++++++++ 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 74bb9a07f0b13..19cc5b7756431 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -302,6 +302,17 @@ X86 Support Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ +- ARMv7+ targets now default to allowing unaligned access, except Armv6-M, and + Armv8-M without the Main Extension. Baremetal targets should check that the + new default will work with their system configurations, since it requires + that SCTLR.A is 0, SCTLR.U is 1, and that the memory in question is + configured as "normal" memory. This brings Clang in-line with the default + settings for GCC and Arm Compiler. Aside from making Clang align with other + compilers, changing the default brings major performance and code size + improvements for most targets. We have not changed the default behavior for + ARMv6, but may revisit that decision in the future. Users can restore the old + behavior with -m[no-]unaligned-access. + Android Support ^^^^^^^^^^^^^^^ diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index e6ee2f88a84ed..ba158b92bb44b 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -890,25 +890,25 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D, // SCTLR.U bit, which is architecture-specific. We assume ARMv6 // Darwin and NetBSD targets support unaligned accesses, and others don't. // - // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit - // which raises an alignment fault on unaligned accesses. Linux - // defaults this bit to 0 and handles it as a system-wide (not - // per-process) setting. 
It is therefore safe to assume that ARMv7+ - // Linux targets support unaligned accesses. The same goes for NaCl - // and Windows. + // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit which + // raises an alignment fault on unaligned accesses. Assume ARMv7+ supports + // unaligned accesses, except ARMv6-M, and ARMv8-M without the Main + // Extension. This aligns with the default behavior of ARM's downstream + // versions of GCC and Clang. // - // The above behavior is consistent with GCC. + // Users can change the default behavior via -m[no-]unaliged-access. int VersionNum = getARMSubArchVersionNumber(Triple); if (Triple.isOSDarwin() || Triple.isOSNetBSD()) { if (VersionNum < 6 || Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v6m) Features.push_back("+strict-align"); - } else if (Triple.isOSLinux() || Triple.isOSNaCl() || - Triple.isOSWindows()) { - if (VersionNum < 7) - Features.push_back("+strict-align"); - } else + } else if (VersionNum < 7 || + Triple.getSubArch() == + llvm::Triple::SubArchType::ARMSubArch_v6m || + Triple.getSubArch() == + llvm::Triple::SubArchType::ARMSubArch_v8m_baseline) { Features.push_back("+strict-align"); + } } // llvm does not support reserving registers in general. 
There is support diff --git a/clang/test/Driver/arm-alignment.c b/clang/test/Driver/arm-alignment.c index 9177b625729b8..8c915477af9af 100644 --- a/clang/test/Driver/arm-alignment.c +++ b/clang/test/Driver/arm-alignment.c @@ -22,6 +22,21 @@ // RUN: %clang -target armv7-windows -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s +// RUN: %clang --target=armv6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-ALIGNED-ARM < %t %s + +// RUN: %clang --target=armv7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s + +// RUN: %clang -target thumbv6m-none-gnueabi -mcpu=cortex-m0 -### %s 2> %t +// RUN: FileCheck --check-prefix CHECK-ALIGNED-ARM <%t %s + +// RUN: %clang -target thumb-none-gnueabi -mcpu=cortex-m0 -### %s 2> %t +// RUN: FileCheck --check-prefix CHECK-ALIGNED-ARM <%t %s + +// RUN: %clang -target thumbv8m.base-none-gnueabi -### %s 2> %t +// RUN: FileCheck --check-prefix CHECK-ALIGNED-ARM <%t %s + // RUN: %clang --target=aarch64 -munaligned-access -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-UNALIGNED-AARCH64 < %t %s From d5a15f3116f8c3ec32df1f13a2fc521a98b03d96 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:27:53 -0600 Subject: [PATCH 282/351] [Clang][NVPTX] Allow passing arguments to the linker while standalone (#73030) Summary: We support standalone compilation for the NVPTX architecture using 'nvlink' as our linker. Because of the special handling required to transform input files to cubins, as nvlink expects for some reason, we didn't use the standard AddLinkerInput method. However, this also meant that we weren't forwarding options passed with -Wl to the linker. Add this support in for the standalone toolchain path. 
Revived from https://reviews.llvm.org/D149978 --- clang/lib/Driver/ToolChains/Cuda.cpp | 43 +++++++++---------- clang/test/Driver/cuda-cross-compiling.c | 9 +++- .../ClangLinkerWrapper.cpp | 4 +- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index ed5924c3b73b5..94d4982d102bb 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -623,35 +623,34 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, continue; } - // Currently, we only pass the input files to the linker, we do not pass - // any libraries that may be valid only for the host. - if (!II.isFilename()) - continue; - // The 'nvlink' application performs RDC-mode linking when given a '.o' // file and device linking when given a '.cubin' file. We always want to // perform device linking, so just rename any '.o' files. // FIXME: This should hopefully be removed if NVIDIA updates their tooling. - auto InputFile = getToolChain().getInputFilename(II); - if (llvm::sys::path::extension(InputFile) != ".cubin") { - // If there are no actions above this one then this is direct input and we - // can copy it. Otherwise the input is internal so a `.cubin` file should - // exist. - if (II.getAction() && II.getAction()->getInputs().size() == 0) { - const char *CubinF = - Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath( - llvm::sys::path::stem(InputFile), "cubin")); - if (llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF))) - continue; + if (II.isFilename()) { + auto InputFile = getToolChain().getInputFilename(II); + if (llvm::sys::path::extension(InputFile) != ".cubin") { + // If there are no actions above this one then this is direct input and + // we can copy it. Otherwise the input is internal so a `.cubin` file + // should exist. 
+ if (II.getAction() && II.getAction()->getInputs().size() == 0) { + const char *CubinF = + Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath( + llvm::sys::path::stem(InputFile), "cubin")); + if (llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF))) + continue; - CmdArgs.push_back(CubinF); + CmdArgs.push_back(CubinF); + } else { + SmallString<256> Filename(InputFile); + llvm::sys::path::replace_extension(Filename, "cubin"); + CmdArgs.push_back(Args.MakeArgString(Filename)); + } } else { - SmallString<256> Filename(InputFile); - llvm::sys::path::replace_extension(Filename, "cubin"); - CmdArgs.push_back(Args.MakeArgString(Filename)); + CmdArgs.push_back(Args.MakeArgString(InputFile)); } - } else { - CmdArgs.push_back(Args.MakeArgString(InputFile)); + } else if (!II.isNothing()) { + II.getInputArg().renderAsInput(Args, CmdArgs); } } diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c index 6c9e2cac736b7..25058358b63a8 100644 --- a/clang/test/Driver/cuda-cross-compiling.c +++ b/clang/test/Driver/cuda-cross-compiling.c @@ -69,6 +69,13 @@ // LOWERING: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor" // +// Test passing arguments directly to nvlink. +// +// RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -### %s 2>&1 \ +// RUN: | FileCheck -check-prefix=LINKER-ARGS %s + +// LINKER-ARGS: nvlink{{.*}}"-v"{{.*}}"a" "b" + // Tests for handling a missing architecture. 
// // RUN: not %clang -target nvptx64-nvidia-cuda %s -### 2>&1 \ @@ -80,4 +87,4 @@ // RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \ // RUN: | FileCheck -check-prefix=GENERIC %s -// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu" +// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu" \ No newline at end of file diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 095cf5ed38169..576e8f2cd7f8f 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -454,9 +454,11 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { Triple.isAMDGPU() ? Args.MakeArgString("-mcpu=" + Arch) : Args.MakeArgString("-march=" + Arch), Args.MakeArgString("-" + OptLevel), - "-Wl,--no-undefined", }; + if (!Triple.isNVPTX()) + CmdArgs.push_back("-Wl,--no-undefined"); + for (StringRef InputFile : InputFiles) CmdArgs.push_back(InputFile); From 018c992879248ad28a04fc7d061922f5ccee4e08 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:29:09 -0600 Subject: [PATCH 283/351] [Flang] Fix the test ordering of the GPU libraries Summary: Turns out these are out of order --- flang/test/Driver/omp-driver-offload.f90 | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90 index 23c2a121a5afa..9b62699030c68 100644 --- a/flang/test/Driver/omp-driver-offload.f90 +++ b/flang/test/Driver/omp-driver-offload.f90 @@ -175,22 +175,24 @@ ! RUN: --offload-arch=sm_52 \ ! RUN: -gpulibc %s 2>&1 \ ! RUN: | FileCheck --check-prefix=LIBC-GPU-NVPTX %s -! LIBC-GPU-NVPTX: "-lcgpu-nvptx"{{.*}}"-lmgpu-nvptx" +! LIBC-GPU-NVPTX-DAG: "-lcgpu-nvptx" +! LIBC-GPU-NVPTX-DAG: "-lmgpu-nvptx" ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ ! RUN: --offload-arch=sm_52 \ ! 
RUN: -nogpulibc %s 2>&1 \ ! RUN: | FileCheck --check-prefix=NO-LIBC-GPU-NVPTX %s -! NO-LIBC-GPU-NVPTX-NOT: "-lcgpu-nvptx"{{.*}}"-lmgpu-nvptx" +! NO-LIBC-GPU-NVPTX-NOT: "-lcgpu-nvptx" ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ ! RUN: --offload-arch=gfx90a \ ! RUN: -gpulibc %s 2>&1 \ ! RUN: | FileCheck --check-prefix=LIBC-GPU-AMDGPU %s -! LIBC-GPU-AMDGPU: "-lcgpu-amdgpu"{{.*}}"-lmgpu-amdgpu" +! LIBC-GPU-AMDGPU-DAG: "-lcgpu-amdgpu" +! LIBC-GPU-AMDGPU-DAG: "-lmgpu-amdgpu" ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ ! RUN: --offload-arch=gfx90a \ ! RUN: -nogpulibc %s 2>&1 \ ! RUN: | FileCheck --check-prefix=NO-LIBC-GPU-AMDGPU %s -! NO-LIBC-GPU-AMDGPU-NOT: "-lcgpu-amdgpu"{{.*}}"-lmgpu-amdgpu" +! NO-LIBC-GPU-AMDGPU-NOT: "-lcgpu-amdgpu" From bc5aba9dd63f919037aded04405f3e05092c9039 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Thu, 22 Feb 2024 17:31:11 -0500 Subject: [PATCH 284/351] [CodeGen][MIR][UnitTests] Fix shared build. NFC --- llvm/unittests/MIR/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/unittests/MIR/CMakeLists.txt b/llvm/unittests/MIR/CMakeLists.txt index f485dcbd971b6..0ad52134a34da 100644 --- a/llvm/unittests/MIR/CMakeLists.txt +++ b/llvm/unittests/MIR/CMakeLists.txt @@ -1,5 +1,6 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} + Analysis CodeGen CodeGenTypes Core From 87b410821148402d74ac7a14bed233078a49cb7b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:49:21 -0600 Subject: [PATCH 285/351] [Libomptarget][NFC] Remove concept of optional plugin functions (#82681) Summary: Ever since the introduction of the new plugins we haven't exercised the concept of "optional" plugin functions. This is done in preparation for making the plugins use a static interface as it will greatly simplify the implementation if we assert that every function has the entrypoints.
Currently some unsupported functions will just return failure or some other default value, so this shouldn't change anything. --- openmp/libomptarget/include/PluginManager.h | 8 ++- .../libomptarget/include/Shared/PluginAPI.inc | 72 +++++++++---------- openmp/libomptarget/src/PluginManager.cpp | 4 +- 3 files changed, 43 insertions(+), 41 deletions(-) diff --git a/openmp/libomptarget/include/PluginManager.h b/openmp/libomptarget/include/PluginManager.h index 5e5306ac776f0..77684285ddf15 100644 --- a/openmp/libomptarget/include/PluginManager.h +++ b/openmp/libomptarget/include/PluginManager.h @@ -69,7 +69,7 @@ struct PluginAdaptorTy { /// Access to the shared object file representing the plugin. std::unique_ptr LibraryHandler; -#define PLUGIN_API_HANDLE(NAME, MANDATORY) \ +#define PLUGIN_API_HANDLE(NAME) \ using NAME##_ty = decltype(__tgt_rtl_##NAME); \ NAME##_ty *NAME = nullptr; @@ -114,8 +114,10 @@ struct PluginManager { // Unregister a shared library from all RTLs. void unregisterLib(__tgt_bin_desc *Desc); - void addDeviceImage(__tgt_bin_desc &TgtBinDesc, __tgt_device_image &TgtDeviceImage) { - DeviceImages.emplace_back(std::make_unique(TgtBinDesc, TgtDeviceImage)); + void addDeviceImage(__tgt_bin_desc &TgtBinDesc, + __tgt_device_image &TgtDeviceImage) { + DeviceImages.emplace_back( + std::make_unique(TgtBinDesc, TgtDeviceImage)); } /// Return the device presented to the user as device \p DeviceNo if it is diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc index 3b982e30307ac..e445da6852f7b 100644 --- a/openmp/libomptarget/include/Shared/PluginAPI.inc +++ b/openmp/libomptarget/include/Shared/PluginAPI.inc @@ -13,39 +13,39 @@ // No include guards! 
-PLUGIN_API_HANDLE(init_plugin, true); -PLUGIN_API_HANDLE(is_valid_binary, true); -PLUGIN_API_HANDLE(is_data_exchangable, false); -PLUGIN_API_HANDLE(number_of_devices, true); -PLUGIN_API_HANDLE(init_device, true); -PLUGIN_API_HANDLE(load_binary, true); -PLUGIN_API_HANDLE(get_global, true); -PLUGIN_API_HANDLE(get_function, true); -PLUGIN_API_HANDLE(data_alloc, true); -PLUGIN_API_HANDLE(data_submit, true); -PLUGIN_API_HANDLE(data_submit_async, false); -PLUGIN_API_HANDLE(data_retrieve, true); -PLUGIN_API_HANDLE(data_retrieve_async, false); -PLUGIN_API_HANDLE(data_exchange, false); -PLUGIN_API_HANDLE(data_exchange_async, false); -PLUGIN_API_HANDLE(data_delete, true); -PLUGIN_API_HANDLE(launch_kernel, true); -PLUGIN_API_HANDLE(init_requires, false); -PLUGIN_API_HANDLE(synchronize, false); -PLUGIN_API_HANDLE(query_async, false); -PLUGIN_API_HANDLE(set_info_flag, false); -PLUGIN_API_HANDLE(print_device_info, false); -PLUGIN_API_HANDLE(create_event, false); -PLUGIN_API_HANDLE(record_event, false); -PLUGIN_API_HANDLE(wait_event, false); -PLUGIN_API_HANDLE(sync_event, false); -PLUGIN_API_HANDLE(destroy_event, false); -PLUGIN_API_HANDLE(init_async_info, false); -PLUGIN_API_HANDLE(init_device_info, false); -PLUGIN_API_HANDLE(data_lock, false); -PLUGIN_API_HANDLE(data_unlock, false); -PLUGIN_API_HANDLE(data_notify_mapped, false); -PLUGIN_API_HANDLE(data_notify_unmapped, false); -PLUGIN_API_HANDLE(set_device_offset, false); -PLUGIN_API_HANDLE(initialize_record_replay, false); -PLUGIN_API_HANDLE(use_auto_zero_copy, false); +PLUGIN_API_HANDLE(init_plugin); +PLUGIN_API_HANDLE(is_valid_binary); +PLUGIN_API_HANDLE(is_data_exchangable); +PLUGIN_API_HANDLE(number_of_devices); +PLUGIN_API_HANDLE(init_device); +PLUGIN_API_HANDLE(load_binary); +PLUGIN_API_HANDLE(get_global); +PLUGIN_API_HANDLE(get_function); +PLUGIN_API_HANDLE(data_alloc); +PLUGIN_API_HANDLE(data_submit); +PLUGIN_API_HANDLE(data_submit_async); +PLUGIN_API_HANDLE(data_retrieve); +PLUGIN_API_HANDLE(data_retrieve_async); 
+PLUGIN_API_HANDLE(data_exchange); +PLUGIN_API_HANDLE(data_exchange_async); +PLUGIN_API_HANDLE(data_delete); +PLUGIN_API_HANDLE(launch_kernel); +PLUGIN_API_HANDLE(init_requires); +PLUGIN_API_HANDLE(synchronize); +PLUGIN_API_HANDLE(query_async); +PLUGIN_API_HANDLE(set_info_flag); +PLUGIN_API_HANDLE(print_device_info); +PLUGIN_API_HANDLE(create_event); +PLUGIN_API_HANDLE(record_event); +PLUGIN_API_HANDLE(wait_event); +PLUGIN_API_HANDLE(sync_event); +PLUGIN_API_HANDLE(destroy_event); +PLUGIN_API_HANDLE(init_async_info); +PLUGIN_API_HANDLE(init_device_info); +PLUGIN_API_HANDLE(data_lock); +PLUGIN_API_HANDLE(data_unlock); +PLUGIN_API_HANDLE(data_notify_mapped); +PLUGIN_API_HANDLE(data_notify_unmapped); +PLUGIN_API_HANDLE(set_device_offset); +PLUGIN_API_HANDLE(initialize_record_replay); +PLUGIN_API_HANDLE(use_auto_zero_copy); diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp index 09f9c6400569c..928913275332c 100644 --- a/openmp/libomptarget/src/PluginManager.cpp +++ b/openmp/libomptarget/src/PluginManager.cpp @@ -56,10 +56,10 @@ PluginAdaptorTy::PluginAdaptorTy(const std::string &Name, Error PluginAdaptorTy::init() { -#define PLUGIN_API_HANDLE(NAME, MANDATORY) \ +#define PLUGIN_API_HANDLE(NAME) \ NAME = reinterpret_cast( \ LibraryHandler->getAddressOfSymbol(GETNAME(__tgt_rtl_##NAME))); \ - if (MANDATORY && !NAME) { \ + if (!NAME) { \ return createStringError(inconvertibleErrorCode(), \ "Invalid plugin as necessary interface function " \ "(%s) was not found.\n", \ From e3cab8fe82eb71fadb251d11fec7df9fa0dbdd27 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:54:03 -0600 Subject: [PATCH 286/351] [LinkerWrapper] Fix test after permitting NVPTX linker arguments Summary: Forgot to change this after a previous patch altered its behaviour. 
--- clang/test/Driver/linker-wrapper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 7fd46778ac910..83df2b84adefe 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -21,7 +21,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK -// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o +// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ @@ -30,7 +30,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O0 \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG -// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o -g +// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 {{.*}}.o {{.*}}.o -g // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ From 4ebee956455caa0da7783280f8515040eac89d08 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Fri, 23 Feb 2024 06:54:39 +0800 Subject: [PATCH 287/351] [mlir][test] Fix -Wunused-variable in PassBuilderCallbacksTest.cpp (NFC) llvm-project/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp:333:10: error: unused variable 'Ret' [-Werror,-Wunused-variable] bool Ret = MIR->parseMachineFunctions(*Mod, MMI); ^ 1 error generated. 
--- llvm/unittests/MIR/PassBuilderCallbacksTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp index 4b7d7846b0336..8e3738dc91920 100644 --- a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp +++ b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp @@ -330,7 +330,7 @@ class MachineFunctionCallbacksTest : public testing::Test { Mod->setModuleIdentifier("module"); Mod->setDataLayout(TM.createDataLayout()); - bool Ret = MIR->parseMachineFunctions(*Mod, MMI); + [[maybe_unused]] bool Ret = MIR->parseMachineFunctions(*Mod, MMI); assert(!Ret); return Mod; From e8740d4eb1c88e968b155f73ac745f80b4681589 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:59:09 -0600 Subject: [PATCH 288/351] [Clang] Fix missing architecture on CUDA test Summary: Sorry about the churn here, my local git tree got corrupted so a few broken tests slipped by while trying to fix it. --- clang/test/Driver/cuda-cross-compiling.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c index 25058358b63a8..086840accebe7 100644 --- a/clang/test/Driver/cuda-cross-compiling.c +++ b/clang/test/Driver/cuda-cross-compiling.c @@ -71,7 +71,7 @@ // // Test passing arguments directly to nvlink. 
// -// RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -### %s 2>&1 \ +// RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -march=sm_52 -### %s 2>&1 \ // RUN: | FileCheck -check-prefix=LINKER-ARGS %s // LINKER-ARGS: nvlink{{.*}}"-v"{{.*}}"a" "b" @@ -87,4 +87,4 @@ // RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \ // RUN: | FileCheck -check-prefix=GENERIC %s -// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu" \ No newline at end of file +// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu" From 5bd0c44bd0b944230ba05c87c19292304b84e980 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 22 Feb 2024 15:22:49 -0800 Subject: [PATCH 289/351] [libc] Match the names of BSD sys/queue.h member names (#82696) While these names are technically internal implementation detail, there's existing code which relies on these details and using different names makes LLVM libc implementation incompatible. Since our goal is for LLVM libc to be a drop-in replacement, use the same name as BSD sys/queue.h version. --- .../llvm-libc-macros/sys-queue-macros.h | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/libc/include/llvm-libc-macros/sys-queue-macros.h b/libc/include/llvm-libc-macros/sys-queue-macros.h index 59e6a9a392c97..7da643cb72533 100644 --- a/libc/include/llvm-libc-macros/sys-queue-macros.h +++ b/libc/include/llvm-libc-macros/sys-queue-macros.h @@ -22,12 +22,12 @@ #define SLIST_HEAD(name, type) \ struct name { \ - struct type *first; \ + struct type *slh_first; \ } #define SLIST_CLASS_HEAD(name, type) \ struct name { \ - class type *first; \ + class type *slh_first; \ } #define SLIST_HEAD_INITIALIZER(head) \ @@ -45,8 +45,8 @@ // Singly-linked list access methods. 
-#define SLIST_EMPTY(head) ((head)->first == NULL) -#define SLIST_FIRST(head) ((head)->first) +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) +#define SLIST_FIRST(head) ((head)->slh_first) #define SLIST_NEXT(elem, field) ((elem)->field.next) #define SLIST_FOREACH(var, head, field) \ @@ -132,18 +132,18 @@ #define STAILQ_HEAD(name, type) \ struct name { \ - struct type *first; \ - struct type **last; \ + struct type *stqh_first; \ + struct type **stqh_last; \ } #define STAILQ_CLASS_HEAD(name, type) \ struct name { \ - class type *first; \ - class type **last; \ + class type *stqh_first; \ + class type **stqh_last; \ } #define STAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).first } + { NULL, &(head).stqh_first } #define STAILQ_ENTRY(type) \ struct { \ @@ -157,12 +157,12 @@ // Singly-linked tail queue access methods. -#define STAILQ_EMPTY(head) ((head)->first == NULL) -#define STAILQ_FIRST(head) ((head)->first) +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) +#define STAILQ_FIRST(head) ((head)->stqh_first) #define STAILQ_LAST(head, type, field) \ (STAILQ_EMPTY(head) \ ? 
NULL \ - : __containerof((head)->last, QUEUE_TYPEOF(type), field.next)) + : __containerof((head)->stqh_last, QUEUE_TYPEOF(type), field.next)) #define STAILQ_NEXT(elem, field) ((elem)->field.next) #define STAILQ_FOREACH(var, head, field) \ @@ -187,8 +187,8 @@ #define STAILQ_CONCAT(head1, head2, type, field) \ do { \ if (!STAILQ_EMPTY(head2)) { \ - *(head1)->last = (head2)->first; \ - (head1)->last = (head2)->last; \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ STAILQ_INIT(head2); \ } \ } while (0) @@ -196,28 +196,28 @@ #define STAILQ_INIT(head) \ do { \ STAILQ_FIRST(head) = NULL; \ - (head)->last = &STAILQ_FIRST(head); \ + (head)->stqh_last = &STAILQ_FIRST(head); \ } while (0) #define STAILQ_INSERT_AFTER(head, listelem, elem, field) \ do { \ if ((STAILQ_NEXT(elem, field) = STAILQ_NEXT(listelem, field)) == NULL) \ - (head)->last = &STAILQ_NEXT(elem, field); \ + (head)->stqh_last = &STAILQ_NEXT(elem, field); \ STAILQ_NEXT(listelem, field) = (elem); \ } while (0) #define STAILQ_INSERT_HEAD(head, elem, field) \ do { \ if ((STAILQ_NEXT(elem, field) = STAILQ_FIRST(head)) == NULL) \ - (head)->last = &STAILQ_NEXT(elem, field); \ + (head)->stqh_last = &STAILQ_NEXT(elem, field); \ STAILQ_FIRST(head) = (elem); \ } while (0) #define STAILQ_INSERT_TAIL(head, elem, field) \ do { \ STAILQ_NEXT(elem, field) = NULL; \ - *(head)->last = (elem); \ - (head)->last = &STAILQ_NEXT(elem, field); \ + *(head)->stqh_last = (elem); \ + (head)->stqh_last = &STAILQ_NEXT(elem, field); \ } while (0) #define STAILQ_REMOVE(head, elem, type, field) \ @@ -236,27 +236,27 @@ do { \ if ((STAILQ_NEXT(elem, field) = \ STAILQ_NEXT(STAILQ_NEXT(elem, field), field)) == NULL) \ - (head)->last = &STAILQ_NEXT(elem, field); \ + (head)->stqh_last = &STAILQ_NEXT(elem, field); \ } while (0) #define STAILQ_REMOVE_HEAD(head, field) \ do { \ if ((STAILQ_FIRST(head) = STAILQ_NEXT(STAILQ_FIRST(head), field)) == NULL) \ - (head)->last = &STAILQ_FIRST(head); \ + 
(head)->stqh_last = &STAILQ_FIRST(head); \ } while (0) #define STAILQ_SWAP(head1, head2, type) \ do { \ QUEUE_TYPEOF(type) *first = STAILQ_FIRST(head1); \ - QUEUE_TYPEOF(type) **last = (head1)->last; \ + QUEUE_TYPEOF(type) **last = (head1)->stqh_last; \ STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ - (head1)->last = (head2)->last; \ + (head1)->stqh_last = (head2)->stqh_last; \ STAILQ_FIRST(head2) = first; \ - (head2)->last = last; \ + (head2)->stqh_last = last; \ if (STAILQ_EMPTY(head1)) \ - (head1)->last = &STAILQ_FIRST(head1); \ + (head1)->stqh_last = &STAILQ_FIRST(head1); \ if (STAILQ_EMPTY(head2)) \ - (head2)->last = &STAILQ_FIRST(head2); \ + (head2)->stqh_last = &STAILQ_FIRST(head2); \ } while (0) #endif // __LLVM_LIBC_MACROS_SYS_QUEUE_MACROS_H From aaf2d078b62251b867f37eaa94621dbbbfa0e5b0 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Thu, 22 Feb 2024 17:31:37 -0600 Subject: [PATCH 290/351] [Hexagon] Clean up redundant transfer instructions. (#82663) This patch adds a Hexagon specific backend pass that cleans up redundant transfers after register allocation. 
--- llvm/lib/Target/Hexagon/CMakeLists.txt | 1 + .../Target/Hexagon/HexagonTargetMachine.cpp | 10 + llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp | 324 ++++++++++++++++++ .../Hexagon/atomicrmw-uinc-udec-wrap.ll | 6 +- llvm/test/CodeGen/Hexagon/isel/select-vec.ll | 2 +- llvm/test/CodeGen/Hexagon/reg-by-name.ll | 4 +- llvm/test/CodeGen/Hexagon/tfr-slotindex.ll | 26 ++ 7 files changed, 366 insertions(+), 7 deletions(-) create mode 100644 llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp create mode 100644 llvm/test/CodeGen/Hexagon/tfr-slotindex.ll diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 19ccd770f071d..2870f0bb6ad32 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -62,6 +62,7 @@ add_llvm_target(HexagonCodeGen HexagonTargetMachine.cpp HexagonTargetObjectFile.cpp HexagonTargetTransformInfo.cpp + HexagonTfrCleanup.cpp HexagonVectorCombine.cpp HexagonVectorLoopCarriedReuse.cpp HexagonVectorPrint.cpp diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index f640f76bc47b8..a5ebd64f1f8af 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -65,6 +65,10 @@ static cl::opt EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::desc("Early expansion of MUX")); +static cl::opt EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true), + cl::Hidden, + cl::desc("Cleanup of TFRs/COPYs")); + static cl::opt EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden, cl::desc("Enable early if-conversion")); @@ -153,6 +157,7 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", namespace llvm { extern char &HexagonExpandCondsetsID; + extern char &HexagonTfrCleanupID; void initializeHexagonBitSimplifyPass(PassRegistry&); void initializeHexagonConstExtendersPass(PassRegistry&); void 
initializeHexagonConstPropagationPass(PassRegistry&); @@ -169,6 +174,7 @@ namespace llvm { void initializeHexagonPostIncOptPass(PassRegistry &); void initializeHexagonRDFOptPass(PassRegistry&); void initializeHexagonSplitDoubleRegsPass(PassRegistry&); + void initializeHexagonTfrCleanupPass(PassRegistry &); void initializeHexagonVExtractPass(PassRegistry &); void initializeHexagonVectorCombineLegacyPass(PassRegistry&); void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &); @@ -204,6 +210,7 @@ namespace llvm { FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonSplitDoubleRegs(); FunctionPass *createHexagonStoreWidening(); + FunctionPass *createHexagonTfrCleanup(); FunctionPass *createHexagonVectorCombineLegacyPass(); FunctionPass *createHexagonVectorPrint(); FunctionPass *createHexagonVExtract(); @@ -258,6 +265,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, (HexagonNoOpt ? CodeGenOptLevel::None : OL)), TLOF(std::make_unique()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); + initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry()); initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -426,6 +434,8 @@ void HexagonPassConfig::addPreRegAlloc() { addPass(createHexagonConstExtenders()); if (EnableExpandCondsets) insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID); + if (EnableTfrCleanup) + insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID); if (!DisableStoreWidening) addPass(createHexagonStoreWidening()); if (EnableGenMemAbs) diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp new file mode 100644 index 0000000000000..a4b359af303a3 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp @@ -0,0 +1,324 @@ +//===------- HexagonTfrCleanup.cpp - Hexagon Transfer Cleanup Pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 
with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This pass is to address a situation that appears after register allocation +// every now and then, namely a register copy from a source that was defined +// as an immediate value in the same block (usually just before the copy). +// +// Here is an example of actual code emitted that shows this problem: +// +// .LBB0_5: +// { +// r5 = zxtb(r8) +// r6 = or(r6, ##12345) +// } +// { +// r3 = xor(r1, r2) +// r1 = #0 <-- r1 set to #0 +// } +// { +// r7 = r1 <-- r7 set to r1 +// r0 = zxtb(r3) +// } + +#define DEBUG_TYPE "tfr-cleanup" +#include "HexagonTargetMachine.h" + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace llvm { +FunctionPass *createHexagonTfrCleanup(); +void initializeHexagonTfrCleanupPass(PassRegistry &); +} // namespace llvm + +namespace { +class HexagonTfrCleanup : public MachineFunctionPass { +public: + static char ID; + HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) { + PassRegistry &R = *PassRegistry::getPassRegistry(); + initializeHexagonTfrCleanupPass(R); + } + StringRef getPassName() const override { return "Hexagon TFR Cleanup"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + 
const HexagonInstrInfo *HII; + const TargetRegisterInfo *TRI; + + typedef DenseMap ImmediateMap; + + bool isIntReg(unsigned Reg, bool &Is32); + void setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap); + bool getReg(unsigned Reg, uint64_t &Val, ImmediateMap &IMap); + bool updateImmMap(MachineInstr *MI, ImmediateMap &IMap); + bool rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, SlotIndexes *Indexes); + bool eraseIfRedundant(MachineInstr *MI, SlotIndexes *Indexes); +}; +} // namespace + +char HexagonTfrCleanup::ID = 0; + +namespace llvm { +char &HexagonTfrCleanupID = HexagonTfrCleanup::ID; +} + +bool HexagonTfrCleanup::isIntReg(unsigned Reg, bool &Is32) { + Is32 = Hexagon::IntRegsRegClass.contains(Reg); + return Is32 || Hexagon::DoubleRegsRegClass.contains(Reg); +} + +// Assign given value V32 to the specified the register R32 in the map. Only +// 32-bit registers are valid arguments. +void HexagonTfrCleanup::setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap) { + ImmediateMap::iterator F = IMap.find(R32); + if (F == IMap.end()) + IMap.insert(std::make_pair(R32, V32)); + else + F->second = V32; +} + +// Retrieve a value of the provided register Reg and store it into Val. +// Return "true" if a value was found, "false" otherwise. +bool HexagonTfrCleanup::getReg(unsigned Reg, uint64_t &Val, + ImmediateMap &IMap) { + bool Is32; + if (!isIntReg(Reg, Is32)) + return false; + + if (Is32) { + ImmediateMap::iterator F = IMap.find(Reg); + if (F == IMap.end()) + return false; + Val = F->second; + return true; + } + + // For 64-bit registers, compose the value from the values of its + // subregisters. 
+ unsigned SubL = TRI->getSubReg(Reg, Hexagon::isub_lo); + unsigned SubH = TRI->getSubReg(Reg, Hexagon::isub_hi); + ImmediateMap::iterator FL = IMap.find(SubL), FH = IMap.find(SubH); + if (FL == IMap.end() || FH == IMap.end()) + return false; + Val = (FH->second << 32) | FL->second; + return true; +} + +// Process an instruction and record the relevant information in the imme- +// diate map. +bool HexagonTfrCleanup::updateImmMap(MachineInstr *MI, ImmediateMap &IMap) { + using namespace Hexagon; + + if (MI->isCall()) { + IMap.clear(); + return true; + } + + // If this is an instruction that loads a constant into a register, + // record this information in IMap. + unsigned Opc = MI->getOpcode(); + if (Opc == A2_tfrsi || Opc == A2_tfrpi) { + unsigned DefR = MI->getOperand(0).getReg(); + bool Is32; + if (!isIntReg(DefR, Is32)) + return false; + if (!MI->getOperand(1).isImm()) { + if (!Is32) { + IMap.erase(TRI->getSubReg(DefR, isub_lo)); + IMap.erase(TRI->getSubReg(DefR, isub_hi)); + } else { + IMap.erase(DefR); + } + return false; + } + uint64_t Val = MI->getOperand(1).getImm(); + // If it's a 64-bit register, break it up into subregisters. + if (!Is32) { + uint32_t VH = (Val >> 32), VL = (Val & 0xFFFFFFFFU); + setReg(TRI->getSubReg(DefR, isub_lo), VL, IMap); + setReg(TRI->getSubReg(DefR, isub_hi), VH, IMap); + } else { + setReg(DefR, Val, IMap); + } + return true; + } + + // Not a A2_tfr[sp]i. Invalidate all modified registers in IMap. + for (MachineInstr::mop_iterator Mo = MI->operands_begin(), + E = MI->operands_end(); + Mo != E; ++Mo) { + if (Mo->isRegMask()) { + IMap.clear(); + return true; + } + if (!Mo->isReg() || !Mo->isDef()) + continue; + unsigned R = Mo->getReg(); + for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR) { + ImmediateMap::iterator F = IMap.find(*AR); + if (F != IMap.end()) + IMap.erase(F); + } + } + return true; +} + +// Rewrite the instruction as A2_tfrsi/A2_tfrpi, it is a copy of a source that +// has a known constant value. 
+bool HexagonTfrCleanup::rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, + SlotIndexes *Indexes) { + using namespace Hexagon; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case A2_tfr: + case A2_tfrp: + case COPY: + break; + default: + return false; + } + + unsigned DstR = MI->getOperand(0).getReg(); + unsigned SrcR = MI->getOperand(1).getReg(); + bool Tmp, Is32; + if (!isIntReg(DstR, Is32) || !isIntReg(SrcR, Tmp)) + return false; + assert(Tmp == Is32 && "Register size mismatch"); + uint64_t Val; + bool Found = getReg(SrcR, Val, IMap); + if (!Found) + return false; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + int64_t SVal = Is32 ? int32_t(Val) : Val; + auto &HST = B.getParent()->getSubtarget(); + MachineInstr *NewMI; + if (Is32) + NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrsi), DstR).addImm(SVal); + else if (isInt<8>(SVal)) + NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrpi), DstR).addImm(SVal); + else if (isInt<8>(SVal >> 32) && isInt<8>(int32_t(Val & 0xFFFFFFFFLL))) + NewMI = BuildMI(B, MI, DL, HII->get(A2_combineii), DstR) + .addImm(int32_t(SVal >> 32)) + .addImm(int32_t(Val & 0xFFFFFFFFLL)); + else if (HST.isTinyCore()) + // Disable generating CONST64 since it requires load resource. + return false; + else + NewMI = BuildMI(B, MI, DL, HII->get(CONST64), DstR).addImm(Val); + + // Replace the MI to reuse the same slot index + if (Indexes) + Indexes->replaceMachineInstrInMaps(*MI, *NewMI); + MI->eraseFromParent(); + return true; +} + +// Remove the instruction if it is a self-assignment. 
+bool HexagonTfrCleanup::eraseIfRedundant(MachineInstr *MI, + SlotIndexes *Indexes) { + unsigned Opc = MI->getOpcode(); + unsigned DefR, SrcR; + bool IsUndef = false; + switch (Opc) { + case Hexagon::A2_tfr: + // Rd = Rd + DefR = MI->getOperand(0).getReg(); + SrcR = MI->getOperand(1).getReg(); + IsUndef = MI->getOperand(1).isUndef(); + break; + case Hexagon::A2_tfrt: + case Hexagon::A2_tfrf: + // if ([!]Pu) Rd = Rd + DefR = MI->getOperand(0).getReg(); + SrcR = MI->getOperand(2).getReg(); + IsUndef = MI->getOperand(2).isUndef(); + break; + default: + return false; + } + if (DefR != SrcR) + return false; + if (IsUndef) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + auto DefI = BuildMI(B, MI, DL, HII->get(TargetOpcode::IMPLICIT_DEF), DefR); + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef() && Op.isImplicit()) + DefI->addOperand(Op); + } + + if (Indexes) + Indexes->removeMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + return true; +} + +bool HexagonTfrCleanup::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + // Map: 32-bit register -> immediate value. + // 64-bit registers are stored through their subregisters. 
+ ImmediateMap IMap; + SlotIndexes *Indexes = this->getAnalysisIfAvailable(); + + auto &HST = MF.getSubtarget(); + HII = HST.getInstrInfo(); + TRI = HST.getRegisterInfo(); + + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock &B = *I; + MachineBasicBlock::iterator J, F, NextJ; + IMap.clear(); + bool Inserted = false, Erased = false; + for (J = B.begin(), F = B.end(); J != F; J = NextJ) { + NextJ = std::next(J); + MachineInstr *MI = &*J; + bool E = eraseIfRedundant(MI, Indexes); + Erased |= E; + if (E) + continue; + Inserted |= rewriteIfImm(MI, IMap, Indexes); + MachineBasicBlock::iterator NewJ = std::prev(NextJ); + updateImmMap(&*NewJ, IMap); + } + bool BlockC = Inserted | Erased; + Changed |= BlockC; + if (BlockC && Indexes) + Indexes->repairIndexesInRange(&B, B.begin(), B.end()); + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// +INITIALIZE_PASS(HexagonTfrCleanup, "tfr-cleanup", "Hexagon TFR Cleanup", false, + false) + +FunctionPass *llvm::createHexagonTfrCleanup() { + return new HexagonTfrCleanup(); +} diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll index 9d7570b9a929e..d51c9554a022c 100644 --- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll @@ -160,10 +160,8 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = cmp.gtu(r3:2,r5:4) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r8 = mux(p0,r8,r1) -; CHECK-NEXT: r9 = mux(p0,r9,r1) +; CHECK-NEXT: if (!p0.new) r8 = add(r1,#0) +; CHECK-NEXT: if (!p0.new) r9 = add(r1,#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: memd_locked(r0,p0) = r9:8 diff --git a/llvm/test/CodeGen/Hexagon/isel/select-vec.ll 
b/llvm/test/CodeGen/Hexagon/isel/select-vec.ll index 4e54aa4212247..7073c1a2a609a 100644 --- a/llvm/test/CodeGen/Hexagon/isel/select-vec.ll +++ b/llvm/test/CodeGen/Hexagon/isel/select-vec.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s define <4 x i8> @f0(<4 x i8> %a0, <4 x i8> %a1, i32 %a2) #0 { ; CHECK-LABEL: f0: diff --git a/llvm/test/CodeGen/Hexagon/reg-by-name.ll b/llvm/test/CodeGen/Hexagon/reg-by-name.ll index 4abea83ba6dd7..cc8807e4f4d6b 100644 --- a/llvm/test/CodeGen/Hexagon/reg-by-name.ll +++ b/llvm/test/CodeGen/Hexagon/reg-by-name.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-tfr-cleanup=0 < %s | FileCheck %s target triple = "hexagon" @@ -647,7 +647,7 @@ entry: ret i32 %1 } -attributes #0 = { noinline nounwind optnone "target-cpu"="hexagonv62" } +attributes #0 = { noinline nounwind optnone "target-cpu"="hexagonv73" } attributes #1 = { nounwind } attributes #2 = { nounwind readonly } diff --git a/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll b/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll new file mode 100644 index 0000000000000..cebba9476c687 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll @@ -0,0 +1,26 @@ +; Check that after tfr-cleanup COPY to $r0 is converted to tfrsi instruction +; The tfrst instruction must use the same slot index as the COPY instruction +; to avoid breaking live interval information. 
+; Check that there is no machine verifier crash + +; RUN: llc -stop-after=tfr-cleanup -verify-machineinstrs %s -o - | FileCheck %s + +; CHECK: $r0 = A2_tfrsi 34767 + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +; Function Attrs: nounwind optsize +define dso_local i32 @foo() local_unnamed_addr #0 { +entry: + call void @bar(i32 34767) #1 + call void @baz(i32 34767) #1 + ret i32 15 +} + +declare void @bar(i32) local_unnamed_addr + +declare void @baz(i32) local_unnamed_addr + +attributes #0 = { nounwind optsize "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" } +attributes #1 = { noduplicate nomerge nounwind } From 568babab7e769a7793c28aee4f889898bf0bd8ba Mon Sep 17 00:00:00 2001 From: Pavel Iliin Date: Thu, 22 Feb 2024 23:33:54 +0000 Subject: [PATCH 291/351] [AArch64] Implement __builtin_cpu_supports, compiler-rt tests. (#82378) The patch complements https://github.com/llvm/llvm-project/pull/68919 and adds AArch64 support for builtin `__builtin_cpu_supports("feature1+...+featureN")` which return true if all specified CPU features in argument are detected. 
Also compiler-rt aarch64 native run tests for features detection mechanism were added and 'cpu_model' check was fixed after its refactor merged https://github.com/llvm/llvm-project/pull/75635 Original RFC was https://reviews.llvm.org/D153153 --- clang/lib/Basic/Targets/AArch64.cpp | 8 ++- clang/lib/Basic/Targets/AArch64.h | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 16 ++++++ clang/lib/CodeGen/CodeGenFunction.h | 2 +- .../CodeGen/aarch64-cpu-supports-target.c | 52 ++++++++++++++++++ clang/test/CodeGen/aarch64-cpu-supports.c | 54 +++++++++++++++++++ clang/test/Preprocessor/has_builtin_cpuid.c | 5 -- clang/test/Sema/aarch64-cpu-supports.c | 26 +++++++++ clang/test/Sema/builtin-cpu-supports.c | 2 +- .../builtins/Unit/aarch64_cpu_features_test.c | 17 ++++++ .../test/builtins/Unit/cpu_model_test.c | 2 +- 11 files changed, 176 insertions(+), 10 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-cpu-supports-target.c create mode 100644 clang/test/CodeGen/aarch64-cpu-supports.c create mode 100644 clang/test/Sema/aarch64-cpu-supports.c create mode 100644 compiler-rt/test/builtins/Unit/aarch64_cpu_features_test.c diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 68032961451d9..5abb060073c51 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -667,7 +667,13 @@ StringRef AArch64TargetInfo::getFeatureDependencies(StringRef Name) const { } bool AArch64TargetInfo::validateCpuSupports(StringRef FeatureStr) const { - return llvm::AArch64::parseArchExtension(FeatureStr).has_value(); + // CPU features might be separated by '+', extract them and check + llvm::SmallVector Features; + FeatureStr.split(Features, "+"); + for (auto &Feature : Features) + if (!llvm::AArch64::parseArchExtension(Feature.trim()).has_value()) + return false; + return true; } bool AArch64TargetInfo::hasFeature(StringRef Feature) const { diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 
26ee7fa197825..c1ba156860a12 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -165,7 +165,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { DiagnosticsEngine &Diags) override; ParsedTargetAttr parseTargetAttr(StringRef Str) const override; bool supportsTargetAttributeTune() const override { return true; } - + bool supportsCpuSupports() const override { return true; } bool checkArithmeticFenceSupported() const override { return true; } bool hasBFloat16Type() const override; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d8b2115f1e5e3..734eb5a035ca4 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10638,6 +10638,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, BuiltinID <= clang::AArch64::LastSMEBuiltin) return EmitAArch64SMEBuiltinExpr(BuiltinID, E); + if (BuiltinID == Builtin::BI__builtin_cpu_supports) + return EmitAArch64CpuSupports(E); + unsigned HintID = static_cast(-1); switch (BuiltinID) { default: break; @@ -14025,6 +14028,19 @@ Value *CodeGenFunction::EmitX86CpuInit() { return Builder.CreateCall(Func); } +Value *CodeGenFunction::EmitAArch64CpuSupports(const CallExpr *E) { + const Expr *ArgExpr = E->getArg(0)->IgnoreParenCasts(); + StringRef ArgStr = cast(ArgExpr)->getString(); + llvm::SmallVector Features; + ArgStr.split(Features, "+"); + for (auto &Feature : Features) { + Feature = Feature.trim(); + if (Feature != "default") + Features.push_back(Feature); + } + return EmitAArch64CpuSupports(Features); +} + llvm::Value * CodeGenFunction::EmitAArch64CpuSupports(ArrayRef FeaturesStrs) { uint64_t FeaturesMask = llvm::AArch64::getCpuSupportsMask(FeaturesStrs); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index caa6a327550ba..92ce0edeaf9e9 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -5013,10 +5013,10 @@ class 
CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitAArch64CpuInit(); llvm::Value * FormAArch64ResolverCondition(const MultiVersionResolverOption &RO); + llvm::Value *EmitAArch64CpuSupports(const CallExpr *E); llvm::Value *EmitAArch64CpuSupports(ArrayRef FeatureStrs); }; - inline DominatingLLVMValue::saved_type DominatingLLVMValue::save(CodeGenFunction &CGF, llvm::Value *value) { if (!needsSaving(value)) return saved_type(value, false); diff --git a/clang/test/CodeGen/aarch64-cpu-supports-target.c b/clang/test/CodeGen/aarch64-cpu-supports-target.c new file mode 100644 index 0000000000000..e023944b24e53 --- /dev/null +++ b/clang/test/CodeGen/aarch64-cpu-supports-target.c @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s + +int check_all_feature() { + if (__builtin_cpu_supports("rng+flagm+flagm2+fp16fml+dotprod+sm4")) + return 1; + else if (__builtin_cpu_supports("rdm+lse+fp+simd+crc+sha1+sha2+sha3")) + return 2; + else if (__builtin_cpu_supports("aes+pmull+fp16+dit+dpb+dpb2+jscvt")) + return 3; + else if (__builtin_cpu_supports("fcma+rcpc+rcpc2+rcpc3+frintts+dgh")) + return 4; + else if (__builtin_cpu_supports("i8mm+bf16+ebf16+rpres+sve+sve-bf16")) + return 5; + else if (__builtin_cpu_supports("sve-ebf16+sve-i8mm+f32mm+f64mm")) + return 6; + else if (__builtin_cpu_supports("sve2+sve2-aes+sve2-pmull128")) + return 7; + else if (__builtin_cpu_supports("sve2-bitperm+sve2-sha3+sve2-sm4")) + return 8; + else if (__builtin_cpu_supports("sme+memtag+memtag2+memtag3+sb")) + return 9; + else if (__builtin_cpu_supports("predres+ssbs+ssbs2+bti+ls64+ls64_v")) + return 10; + else if (__builtin_cpu_supports("ls64_accdata+wfxt+sme-f64f64")) + return 11; + else if (__builtin_cpu_supports("sme-i16i64+sme2")) + return 12; + else + return 0; +} + +// CHECK-LABEL: define dso_local i32 @neon_code() #1 +int __attribute__((target("simd"))) neon_code() { return 1; } + +// CHECK-LABEL: define dso_local i32 @sve_code() #2 +int 
__attribute__((target("sve"))) sve_code() { return 2; } + +// CHECK-LABEL: define dso_local i32 @code() #0 +int code() { return 3; } + +// CHECK-LABEL: define dso_local i32 @test_versions() #0 +int test_versions() { + if (__builtin_cpu_supports("sve")) + return sve_code(); + else if (__builtin_cpu_supports("simd")) + return neon_code(); + else + return code(); +} +// CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon" } +// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } diff --git a/clang/test/CodeGen/aarch64-cpu-supports.c b/clang/test/CodeGen/aarch64-cpu-supports.c new file mode 100644 index 0000000000000..872fec6827ef1 --- /dev/null +++ b/clang/test/CodeGen/aarch64-cpu-supports.c @@ -0,0 +1,54 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 2 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s + +// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } +// CHECK-LABEL: define dso_local i32 @main +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +// CHECK: if.then: +// CHECK-NEXT: store i32 1, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label [[RETURN:%.*]] +// CHECK: if.end: +// 
CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 9070970929152 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 9070970929152 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[IF_THEN1:%.*]], label [[IF_END2:%.*]] +// CHECK: if.then1: +// CHECK-NEXT: store i32 2, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label [[RETURN]] +// CHECK: if.end2: +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 166633186212708352 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 166633186212708352 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label [[IF_THEN3:%.*]], label [[IF_END4:%.*]] +// CHECK: if.then3: +// CHECK-NEXT: store i32 3, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label [[RETURN]] +// CHECK: if.end4: +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label [[RETURN]] +// CHECK: return: +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP12]] +// +int main(void) { + if (__builtin_cpu_supports("sb")) + return 1; + + if (__builtin_cpu_supports("sve2-pmull128+memtag")) + return 2; + + if (__builtin_cpu_supports("sme2+ls64_v+wfxt")) + return 3; + + return 0; +} diff --git a/clang/test/Preprocessor/has_builtin_cpuid.c b/clang/test/Preprocessor/has_builtin_cpuid.c index 8de6331e62d6e..35ef65ecdd9b9 100644 --- a/clang/test/Preprocessor/has_builtin_cpuid.c +++ b/clang/test/Preprocessor/has_builtin_cpuid.c @@ -13,8 +13,3 @@ # error "ARM/PPC shouldn't have __builtin_cpu_init" # endif #endif -#if __has_builtin(__builtin_cpu_supports) -# ifdef ARM -# error "ARM shouldn't have __builtin_cpu_supports" -# endif -#endif diff --git a/clang/test/Sema/aarch64-cpu-supports.c b/clang/test/Sema/aarch64-cpu-supports.c new file mode 100644 index 0000000000000..24aae9542dbc4 --- /dev/null +++ 
b/clang/test/Sema/aarch64-cpu-supports.c @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-linux-gnu -verify %s + +int test_aarch64_features(void) { + char * ssbs2; + // expected-error@+1 {{expression is not a string literal}} + if (__builtin_cpu_supports(ssbs2)) + return 1; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("")) + return 2; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("pmull128")) + return 3; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("sve2,rpres")) + return 4; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("dgh+sve2-pmull")) + return 5; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("default")) + return 6; + if (__builtin_cpu_supports(" ssbs + bti ")) + return 7; + return 0; +} diff --git a/clang/test/Sema/builtin-cpu-supports.c b/clang/test/Sema/builtin-cpu-supports.c index cc6f1beb5d8a7..733d797f3ff8f 100644 --- a/clang/test/Sema/builtin-cpu-supports.c +++ b/clang/test/Sema/builtin-cpu-supports.c @@ -27,7 +27,7 @@ int main(void) { (void)__builtin_cpu_supports("x86-64-v4"); (void)__builtin_cpu_supports("x86-64-v5"); // expected-error {{invalid cpu feature string for builtin}} #else - if (__builtin_cpu_supports("aes")) // expected-error {{builtin is not supported on this target}} + if (__builtin_cpu_supports("neon")) // expected-error {{invalid cpu feature string for builtin}} a("vsx"); if (__builtin_cpu_is("cortex-x3")) // expected-error {{builtin is not supported on this target}} diff --git a/compiler-rt/test/builtins/Unit/aarch64_cpu_features_test.c b/compiler-rt/test/builtins/Unit/aarch64_cpu_features_test.c new file mode 100644 index 0000000000000..7ca2710ea2756 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/aarch64_cpu_features_test.c @@ -0,0 +1,17 @@ +// REQUIRES: aarch64-target-arch +// REQUIRES: native-run +// RUN: %clang_builtins %s 
%librt -o %t && %run %t +// REQUIRES: librt_has_aarch64 +int main(void) { + if (__builtin_cpu_supports("fp+simd+pmull+sha2+crc")) { + if (__builtin_cpu_supports("fp") && __builtin_cpu_supports("simd") && + __builtin_cpu_supports("pmull") && __builtin_cpu_supports("sha2") && + __builtin_cpu_supports("crc")) { + return 0; + } else { + // Something wrong in feature detection + return 1; + } + } + return 0; +} diff --git a/compiler-rt/test/builtins/Unit/cpu_model_test.c b/compiler-rt/test/builtins/Unit/cpu_model_test.c index a8b736802f67b..6d5f17aa12565 100644 --- a/compiler-rt/test/builtins/Unit/cpu_model_test.c +++ b/compiler-rt/test/builtins/Unit/cpu_model_test.c @@ -1,6 +1,6 @@ // REQUIRES: x86-target-arch // RUN: %clang_builtins %s %librt -o %t && %run %t -// REQUIRES: librt_has_cpu_model +// REQUIRES: librt_has_x86 // FIXME: XFAIL the test because it is expected to return non-zero value. // XFAIL: * From 2b0f5667e2b40729f714459093eb16cc53fc9e9a Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 22 Feb 2024 23:37:49 +0000 Subject: [PATCH 292/351] [gn build] Port aaf2d078b622 --- llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn index 09b5811d7d122..747ca8f9c91d3 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn @@ -86,6 +86,7 @@ static_library("LLVMHexagonCodeGen") { "HexagonTargetMachine.cpp", "HexagonTargetObjectFile.cpp", "HexagonTargetTransformInfo.cpp", + "HexagonTfrCleanup.cpp", "HexagonVExtract.cpp", "HexagonVLIWPacketizer.cpp", "HexagonVectorCombine.cpp", From d57f158a9546746219e3b01398886e104d8a0fdb Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Thu, 22 Feb 2024 15:54:42 -0800 Subject: [PATCH 293/351] [Tosa] Add Tosa Sin and Cos operators (#82510) - Add Tosa Sin and Cos operators to the MLIR 
dialect - Define the new Tosa_FloatTensor type --------- Signed-off-by: Jerry Ge --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 40 +++++++++++++++++++ .../mlir/Dialect/Tosa/IR/TosaTypesBase.td | 2 + mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 2 + mlir/test/Dialect/Tosa/ops.mlir | 14 +++++++ 4 files changed, 58 insertions(+) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 0ee9e713724ea..0ecded75c5d8b 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -989,6 +989,26 @@ def Tosa_ClzOp : Tosa_ElementwiseOp<"clz", [SameOperandsAndResultElementType]> { ); } +//===----------------------------------------------------------------------===// +// Operator: cos +//===----------------------------------------------------------------------===// +def Tosa_CosOp : Tosa_ElementwiseOp<"cos", + [SameOperandsAndResultElementType]> { + let summary = "Elementwise cos op"; + + let description = [{ + Elementwise cosine operation for values given in radians. + }]; + + let arguments = (ins + Tosa_FloatTensor:$input + ); + + let results = (outs + Tosa_FloatTensor:$output + ); +} + //===----------------------------------------------------------------------===// // Operator: exp //===----------------------------------------------------------------------===// @@ -1148,6 +1168,26 @@ def Tosa_RsqrtOp : Tosa_ElementwiseOp<"rsqrt", ); } +//===----------------------------------------------------------------------===// +// Operator: sin +//===----------------------------------------------------------------------===// +def Tosa_SinOp : Tosa_ElementwiseOp<"sin", + [SameOperandsAndResultElementType]> { + let summary = "Elementwise sin op"; + + let description = [{ + Elementwise sine operation for values given in radians. 
+ }]; + + let arguments = (ins + Tosa_FloatTensor:$input + ); + + let results = (outs + Tosa_FloatTensor:$output + ); +} + //===----------------------------------------------------------------------===// // TOSA Spec Section 2.6 // Operator Class: Elementwise unary/binary/ternary operators. diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index c55ddaafdda76..5a4d6ff464f19 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -113,6 +113,8 @@ def Tosa_Weight : AnyTypeOf<[Tosa_Int4, Tosa_Int8, def Tosa_Int32Tensor : TensorOf<[Tosa_Int32]>; def Tosa_Int32Or64Tensor : TensorOf<[Tosa_Int32Or64]>; +def Tosa_FloatTensor : TensorOf<[Tosa_Float]>; + // Either ranked or unranked tensor of TOSA supported element types. def Tosa_Tensor : TensorOf<[Tosa_AnyNumber]>; def Tosa_Tensor_Plus_F64 : TensorOf<[Tosa_AnyNumber_Plus_F64]>; diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 950ee597b891b..62d07859e32f6 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -1330,6 +1330,7 @@ NARY_SHAPE_INFER(tosa::CastOp) NARY_SHAPE_INFER(tosa::CeilOp) NARY_SHAPE_INFER(tosa::ClampOp) NARY_SHAPE_INFER(tosa::ClzOp) +NARY_SHAPE_INFER(tosa::CosOp) NARY_SHAPE_INFER(tosa::DivOp) NARY_SHAPE_INFER(tosa::ExpOp) NARY_SHAPE_INFER(tosa::FloorOp) @@ -1352,6 +1353,7 @@ NARY_SHAPE_INFER(tosa::ReciprocalOp) NARY_SHAPE_INFER(tosa::RescaleOp) NARY_SHAPE_INFER(tosa::ReverseOp) NARY_SHAPE_INFER(tosa::RsqrtOp) +NARY_SHAPE_INFER(tosa::SinOp) NARY_SHAPE_INFER(tosa::SelectOp) NARY_SHAPE_INFER(tosa::SubOp) NARY_SHAPE_INFER(tosa::TanhOp) diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index 3d68464ebf0b3..01b27072a4b64 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -375,6 +375,13 @@ func.func @test_clz(%arg0: 
tensor<13x21x3xi32>) -> tensor<13x21x3xi32> { return %0 : tensor<13x21x3xi32> } +// ----- +// CHECK-LABEL: cos +func.func @test_cos(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.cos %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + // ----- // CHECK-LABEL: exp func.func @test_exp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { @@ -424,6 +431,13 @@ func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { return %0 : tensor<13x21x3xf32> } +// ----- +// CHECK-LABEL: sin +func.func @test_sin(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.sin %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + // ----- // CHECK-LABEL: select func.func @test_select(%arg0: tensor<1x1x1xi1>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { From f37c6d55c6a0c695418932a55bac6a517be4a53a Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Thu, 22 Feb 2024 15:55:26 -0800 Subject: [PATCH 294/351] [AMDGPU][NFC] Refactor SIInsertWaitcnts zero waitcnt generation (#82575) Move the allZero* waitcnt generation methods into WaitcntGenerator class. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 28 +++++++++++++++---- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 9 ------ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6ecb1c8bf6e1d..a6184c5e1e048 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -480,6 +480,10 @@ class WaitcntGenerator { // WaitEventType to corresponding counter values in InstCounterType. virtual const unsigned *getWaitEventMask() const = 0; + // Returns a new waitcnt with all counters except VScnt set to 0. If + // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. 
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; + virtual ~WaitcntGenerator() = default; }; @@ -516,6 +520,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { return WaitEventMaskForInstPreGFX12; } + + virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { @@ -549,6 +555,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { return WaitEventMaskForInstGFX12Plus; } + + virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -1304,6 +1312,16 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( return Modified; } +AMDGPU::Waitcnt +WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); +} + +AMDGPU::Waitcnt +WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0); +} + /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that /// were added by previous passes. Currently this pass conservatively @@ -1613,8 +1631,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined( - AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); + Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM // stores. In this case it can be useful to send a message to explicitly @@ -1834,8 +1851,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. 
if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { - Wait = Wait.combined( - AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt())); + Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } // TODO: Remove this work-around, enable the assert for Bug 457939 @@ -1851,7 +1867,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.simplifyWaitcnt(Wait); if (ForceEmitZeroWaitcnts) - Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()); + Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); if (ForceEmitWaitcnt[LOAD_CNT]) Wait.LoadCnt = 0; @@ -2089,7 +2105,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything ScoreBrackets->applyWaitcnt( - AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); + WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else { // May need to way wait for anything. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f35e774452829..b38016a581603 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -870,15 +870,6 @@ struct Waitcnt { : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {} - static Waitcnt allZero(bool Extended, bool HasStorecnt) { - return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0) - : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u); - } - - static Waitcnt allZeroExceptVsCnt(bool Extended) { - return Extended ? 
Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u); - } - bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } bool hasWaitExceptStoreCnt() const { From cd1d4d8dd31f527615de26f5b62d687c6b2982a6 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Thu, 22 Feb 2024 15:56:13 -0800 Subject: [PATCH 295/351] [mlir][Vector] Add missing CHECK rules to vector-transfer-flatten.mlir (#82698) This test failed after landing #81964 due to a bad merge. I provided a quick fix and this PR is adding the rest of CHECK rules that were not merged properly. --- mlir/test/Dialect/Vector/vector-transfer-flatten.mlir | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 2766e782a3fb2..788ae9ac044ed 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -476,6 +476,7 @@ func.func @regression_non_contiguous_dim_read(%subview : memref<1x3x3x2xf32, str // CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() // CHECK-128B-LABEL: func @regression_non_contiguous_dim_read( +// CHECK-128B: memref.collapse_shape // ----- @@ -491,3 +492,4 @@ func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, // CHECK-NOT: memref.collapse_shape // CHECK-128B-LABEL: func @unsupported_non_contiguous_dim_write( +// CHECK-128B-NOT: memref.collapse_shape From ac518c7c9916a6fde1d898b8c53b74298fd00d5f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Feb 2024 16:17:48 -0800 Subject: [PATCH 296/351] [RISCV] Vector sub (zext, zext) -> sext (sub (zext, zext)) (#82455) This is legal as long as the inner zext retains at least one bit of increase so that the sub overflow case (0 - UINT_MAX) can be represented. Alive2 proof: https://alive2.llvm.org/ce/z/BKeV3W For RVV, restrict this to power of two sizes with the operation type being at least e8 to stick to legal extends. 
We could arguably handle i1 source types with some care if we wanted to. This is likely profitable because it may allow us to perform the sub instruction in a narrow LMUL (equivalently, in fewer DLEN-sized pieces) before widening for the user. We could arguably avoid narrowing below DLEN, but the transform should at worst introduce one extra extend and one extra vsetvli toggle if the source could previously be handled via loads explicit w/EEW. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 25 ++++++++++++++- .../CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll | 32 +++++++++---------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6bf02cf8c0f87..5c67aaf678566 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -12887,6 +12887,7 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineSubOfBoolean(N, DAG)) return V; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1) @@ -12894,7 +12895,6 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, isNullConstant(N1.getOperand(1))) { ISD::CondCode CCVal = cast(N1.getOperand(2))->get(); if (CCVal == ISD::SETLT) { - EVT VT = N->getValueType(0); SDLoc DL(N); unsigned ShAmt = N0.getValueSizeInBits() - 1; return DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), @@ -12902,6 +12902,29 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, } } + // sub (zext, zext) -> sext (sub (zext, zext)) + // where the sum of the extend widths match, and the inner zexts + // add at least one bit. (For profitability on rvv, we use a + // power of two for both inner and outer extend.) 
+ if (VT.isVector() && Subtarget.getTargetLowering()->isTypeLegal(VT) && + N0.getOpcode() == N1.getOpcode() && N0.getOpcode() == ISD::ZERO_EXTEND && + N0.hasOneUse() && N1.hasOneUse()) { + SDValue Src0 = N0.getOperand(0); + SDValue Src1 = N1.getOperand(0); + EVT SrcVT = Src0.getValueType(); + if (Subtarget.getTargetLowering()->isTypeLegal(SrcVT) && + SrcVT == Src1.getValueType() && SrcVT.getScalarSizeInBits() >= 8 && + SrcVT.getScalarSizeInBits() < VT.getScalarSizeInBits() / 2) { + LLVMContext &C = *DAG.getContext(); + EVT ElemVT = VT.getVectorElementType().getHalfSizedIntegerVT(C); + EVT NarrowVT = EVT::getVectorVT(C, ElemVT, VT.getVectorElementCount()); + Src0 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src0), NarrowVT, Src0); + Src1 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src1), NarrowVT, Src1); + return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, + DAG.getNode(ISD::SUB, SDLoc(N), NarrowVT, Src0, Src1)); + } + } + // fold (sub x, (select lhs, rhs, cc, 0, y)) -> // (select lhs, rhs, cc, x, (sub x, y)) return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false, Subtarget); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index 574c2652ccfac..a084b5383b403 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -385,12 +385,12 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwsubu_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf2 v11, v9 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, 
ptr %y @@ -899,12 +899,12 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i32_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf2 v11, v9 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -917,12 +917,12 @@ define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i64_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vzext.vf4 v10, v8 -; CHECK-NEXT: vzext.vf4 v11, v9 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf4 v8, v10 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -935,12 +935,12 @@ define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwsubu_v2i64_of_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i64_of_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf2 v11, v9 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y From 
9e84a22e6989494709d30a03ce9b304956fc0ae2 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 18:22:16 -0600 Subject: [PATCH 297/351] [libc] Silence warnings when building GPU tests (#82701) Summary: This patch silences two warnings that may occur during the building of GPU tests. These are not informative or helpful and just make the test output longer. --- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 4 ++-- libc/cmake/modules/LLVMLibCTestRules.cmake | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 33ba5da4f8d57..408e25b3469c0 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -158,12 +158,12 @@ function(_get_hermetic_test_compile_options output_var flags) # The GPU build requires overriding the default CMake triple and architecture. if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) list(APPEND compile_options - -nogpulib -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto + -Wno-multi-gpu -nogpulib -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}) elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) list(APPEND compile_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false" - --cuda-path=${LIBC_CUDA_ROOT} + -Wno-multi-gpu --cuda-path=${LIBC_CUDA_ROOT} -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit) endif() set(${output_var} ${compile_options} PARENT_SCOPE) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 373cbd6853859..1166c26e4d8a5 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -470,6 +470,7 @@ function(add_integration_test test_name) # We need to use the internal object versions for NVPTX. 
set(internal_suffix ".__internal__") target_link_options(${fq_build_target_name} PRIVATE + "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static "--cuda-path=${LIBC_CUDA_ROOT}") elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) @@ -650,6 +651,7 @@ function(add_libc_hermetic_test test_name) # We need to use the internal object versions for NVPTX. set(internal_suffix ".__internal__") target_link_options(${fq_build_target_name} PRIVATE + "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static "--cuda-path=${LIBC_CUDA_ROOT}") elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) From 7a5c01dbca3ddfc6dd87775ec90346783c8e2c73 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 18:55:46 -0600 Subject: [PATCH 298/351] [libc] Search the compiler's path for GPU utility tools (#82712) Summary: We need some extra tools for the GPU build. Normally we search for these from the build itself, but in the case of a `LLVM_PROJECTS_BUILD` or some other kind of external build, this directory will not be populated. However, the GPU build already requires that the compiler is an up-to-date clang, which should always have these present next to the binary. Simply add this as a fallback search path. Generally we want it to be the second, because it would pick up someone install and then become stale. --- libc/cmake/modules/prepare_libc_gpu_build.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake index 75beef86760c8..752182f67cc01 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -17,9 +17,10 @@ if(NOT LLVM_LIBC_FULL_BUILD) endif() # Identify the program used to package multiple images into a single binary. 
+get_filename_component(compiler_path ${CMAKE_CXX_COMPILER} DIRECTORY) find_program(LIBC_CLANG_OFFLOAD_PACKAGER NAMES clang-offload-packager NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin) + PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path}) if(NOT LIBC_CLANG_OFFLOAD_PACKAGER) message(FATAL_ERROR "Cannot find the 'clang-offload-packager' for the GPU " "build") @@ -45,7 +46,7 @@ elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) # Using 'check_cxx_compiler_flag' does not work currently due to the link job. find_program(LIBC_NVPTX_ARCH NAMES nvptx-arch NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin) + PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path}) if(LIBC_NVPTX_ARCH) execute_process(COMMAND ${LIBC_NVPTX_ARCH} OUTPUT_VARIABLE arch_tool_output From 590c968e7943e51bb00ff75d312435f24d983b2a Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 22 Feb 2024 17:27:28 -0800 Subject: [PATCH 299/351] [NVPTX] fixup support for unaligned parameters and returns (#82562) Add support for unaligned parameters and return values. These must be loaded and stored one byte at a time and then bit manipulation is used to assemble the correct final result. 
--- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 30 ++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 257 +++++++++++- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4 + llvm/test/CodeGen/NVPTX/param-load-store.ll | 93 ++++- .../NVPTX/unaligned-param-load-store.ll | 385 ++++++++++++++++++ 5 files changed, 730 insertions(+), 39 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ded2f2584014d..3ff8994602e16 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16, NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64, NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64); + if (Opcode == NVPTX::StoreRetvalI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). + switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreRetvalI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreRetvalI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, @@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { NVPTX::StoreParamI8, NVPTX::StoreParamI16, NVPTX::StoreParamI32, NVPTX::StoreParamI64, NVPTX::StoreParamF32, NVPTX::StoreParamF64); + if (Opcode == NVPTX::StoreParamI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). 
+ switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreParamI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreParamI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7d2fe78d14229..66a101036f913 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -59,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } +static bool adjustElementType(EVT &ElementType) { + switch (ElementType.getSimpleVT().SimpleTy) { + default: + return false; + case MVT::f16: + case MVT::bf16: + ElementType = MVT::i16; + return true; + case MVT::f32: + case MVT::v2f16: + case MVT::v2bf16: + ElementType = MVT::i32; + return true; + case MVT::f64: + ElementType = MVT::i64; + return true; + } +} + +// Use byte-store when the param address of the argument value is unaligned. +// This may happen when the return value is a field of a packed structure. +// +// This is called in LowerCall() when passing the param values. 
+static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue StVal, SDValue &InGlue, + unsigned ArgID, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); + + // Store each byte + SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal, InGlue}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode( + NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, + MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); + InGlue = Chain.getValue(1); + } + return Chain; +} + +// Use byte-load when the param adress of the returned value is unaligned. +// This may happen when the returned value is a field of a packed structure. +static SDValue +LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, + EVT ElementType, SDValue &InGlue, + SmallVectorImpl &TempProxyRegOps, + const SDLoc &dl) { + // Bit logic only works on integer types + EVT MergedType = ElementType; + adjustElementType(MergedType); + + // Load each byte and construct the whole value. 
Initial value to 0 + SDValue RetVal = DAG.getConstant(0, dl, MergedType); + // LoadParamMemI8 loads into i16 register only + SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + InGlue}; + // This will be selected to LoadParamMemI8 + SDValue LdVal = + DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, + MVT::i8, MachinePointerInfo(), Align(1)); + SDValue TmpLdVal = LdVal.getValue(0); + Chain = LdVal.getValue(1); + InGlue = LdVal.getValue(2); + + TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, + TmpLdVal.getSimpleValueType(), TmpLdVal); + TempProxyRegOps.push_back(TmpLdVal); + + SDValue CMask = DAG.getConstant(255, dl, MergedType); + SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); + // Need to extend the i16 register to the whole width. + TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); + // Mask off the high bits. Leave only the lower 8bits. + // Do this because we are using loadparam.b8. + TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); + // Shift and merge + TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); + RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); + } + if (ElementType != MergedType) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + return RetVal; +} + SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (NeedAlign) PartAlign = commonAlignment(ArgAlign, CurOffset); - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back( - DAG.getConstant(IsVAArg ? 
FirstVAArg : ParamCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant( - IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), - dl, MVT::i32)); - } - SDValue StVal = OutVals[OIdx]; MVT PromotedVT; @@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar store. In such cases, fall back to byte stores. + if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() && + PartAlign.value() < + DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) { + assert(StoreOperands.empty() && "Unfinished preceeding store."); + Chain = LowerUnalignedStoreParam( + DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT, + StVal, InGlue, ParamCount, dl); + + // LowerUnalignedStoreParam took care of inserting the necessary nodes + // into the SDAG, so just move on to the next element. + if (!IsByVal) + ++OIdx; + continue; + } + + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back( + DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); + + StoreOperands.push_back(DAG.getConstant( + IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), + dl, MVT::i32)); + } + // Record the value to store. StoreOperands.push_back(StVal); @@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector ProxyRegOps; SmallVector, 16> ProxyRegTruncates; + // An item of the vector is filled if the element does not need a ProxyReg + // operation on it and should be added to InVals as is. ProxyRegOps and + // ProxyRegTruncates contain empty/none items at the same index. 
+ SmallVector RetElts; + // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` + // to use the values of `LoadParam`s and to be replaced later then + // `CALLSEQ_END` is added. + SmallVector TempProxyRegOps; // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { @@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EltType = MVT::i16; } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar load. In such cases, fall back to byte loads. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && + EltAlign < DL.getABITypeAlign( + TheLoadType.getTypeForEVT(*DAG.getContext()))) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + SDValue Ret = LowerUnalignedLoadRetParam( + DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl); + ProxyRegOps.push_back(SDValue()); + ProxyRegTruncates.push_back(std::optional()); + RetElts.resize(i); + RetElts.push_back(Ret); + + continue; + } + // Record index of the very first element of the vector. if (VectorInfo[i] & PVF_FIRST) { assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); @@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. 
for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { + if (i < RetElts.size() && RetElts[i]) { + InVals.push_back(RetElts[i]); + continue; + } + SDValue Ret = DAG.getNode( NVPTXISD::ProxyReg, dl, DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), @@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InVals.push_back(Ret); } + for (SDValue &T : TempProxyRegOps) { + SDValue Repl = DAG.getNode( + NVPTXISD::ProxyReg, dl, + DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue), + {Chain, T.getOperand(0), InGlue}); + DAG.ReplaceAllUsesWith(T, Repl); + DAG.RemoveDeadNode(T.getNode()); + + Chain = Repl.getValue(1); + InGlue = Repl.getValue(2); + } + // set isTailCall to false for now, until we figure out how to express // tail call optimization in PTX isTailCall = false; @@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); Value *srcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + + const MaybeAlign PartAlign = [&]() -> MaybeAlign { + if (aggregateIsPacked) + return Align(1); + if (NumElts != 1) + return std::nullopt; + Align PartAlign = + (Offsets[parti] == 0 && PAL.getParamAlignment(i)) + ? PAL.getParamAlignment(i).value() + : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); + return commonAlignment(PartAlign, Offsets[parti]); + }(); SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, - MachinePointerInfo(srcValue), - MaybeAlign(aggregateIsPacked ? 1 : 0), + MachinePointerInfo(srcValue), PartAlign, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); if (P.getNode()) @@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( return Chain; } +// Use byte-store when the param adress of the return value is unaligned. +// This may happen when the return value is a field of a packed structure. 
+static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue RetVal, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + // Store each byte + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, + DAG.getVTList(MVT::Other), StoreOperands, + MVT::i8, MachinePointerInfo(), std::nullopt, + MachineMemOperand::MOStore); + } + return Chain; +} + SDValue NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector StoreOperands; for (unsigned i = 0, e = VTs.size(); i != e; ++i) { - // New load/store. Record chain and offset operands. - if (VectorInfo[i] & PVF_FIRST) { - assert(StoreOperands.empty() && "Orphaned operand list."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); - } - SDValue OutVal = OutVals[i]; SDValue RetVal = PromotedOutVals[i]; @@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); } + // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned + // for a scalar store. In such cases, fall back to byte stores. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { + EVT ElementType = ExtendIntegerRetVal ? 
MVT::i32 : VTs[i]; + Align ElementTypeAlign = + DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext())); + Align ElementAlign = + commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]); + if (ElementAlign < ElementTypeAlign) { + assert(StoreOperands.empty() && "Orphaned operand list."); + Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType, + RetVal, dl); + + // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes + // into the graph, so just move on to the next element. + continue; + } + } + + // New load/store. Record chain and offset operands. + if (VectorInfo[i] & PVF_FIRST) { + assert(StoreOperands.empty() && "Orphaned operand list."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); + } + // Record the value to return. StoreOperands.push_back(RetVal); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 55a1955a7f497..b3517ce066b87 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2738,6 +2738,8 @@ def StoreParamI32 : StoreParamInst; def StoreParamI16 : StoreParamInst; def StoreParamI8 : StoreParamInst; +def StoreParamI8TruncI32 : StoreParamInst; +def StoreParamI8TruncI64 : StoreParamInst; def StoreParamV2I64 : StoreParamV2Inst; def StoreParamV2I32 : StoreParamV2Inst; def StoreParamV2I16 : StoreParamV2Inst; @@ -2757,6 +2759,8 @@ def StoreRetvalI64 : StoreRetvalInst; def StoreRetvalI32 : StoreRetvalInst; def StoreRetvalI16 : StoreRetvalInst; def StoreRetvalI8 : StoreRetvalInst; +def StoreRetvalI8TruncI32 : StoreRetvalInst; +def StoreRetvalI8TruncI64 : StoreRetvalInst; def StoreRetvalV2I64 : StoreRetvalV2Inst; def StoreRetvalV2I32 : StoreRetvalV2Inst; def StoreRetvalV2I16 : StoreRetvalV2Inst; diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index c14dc88431d31..a29d4e1875cd7 100644 --- 
a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -1135,31 +1135,86 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; -; --- TODO -; --- Unaligned parameter store/ return value load is broken in both nvcc -; --- and llvm and needs to be fixed. ; CHECK: .param .align 1 .b8 param0[25]; -; CHECK-DAG: st.param.b32 [param0+0], -; CHECK-DAG: st.param.b32 [param0+4], +; CHECK-DAG: st.param.b8 [param0+0], +; CHECK-DAG: st.param.b8 [param0+1], +; CHECK-DAG: st.param.b8 [param0+2], +; CHECK-DAG: st.param.b8 [param0+3], +; CHECK-DAG: st.param.b8 [param0+4], +; CHECK-DAG: st.param.b8 [param0+5], +; CHECK-DAG: st.param.b8 [param0+6], +; CHECK-DAG: st.param.b8 [param0+7], ; CHECK-DAG: st.param.b8 [param0+8], -; CHECK-DAG: st.param.b32 [param0+9], -; CHECK-DAG: st.param.b32 [param0+13], -; CHECK-DAG: st.param.b64 [param0+17], +; CHECK-DAG: st.param.b8 [param0+9], +; CHECK-DAG: st.param.b8 [param0+10], +; CHECK-DAG: st.param.b8 [param0+11], +; CHECK-DAG: st.param.b8 [param0+12], +; CHECK-DAG: st.param.b8 [param0+13], +; CHECK-DAG: st.param.b8 [param0+14], +; CHECK-DAG: st.param.b8 [param0+15], +; CHECK-DAG: st.param.b8 [param0+16], +; CHECK-DAG: st.param.b8 [param0+17], +; CHECK-DAG: st.param.b8 [param0+18], +; CHECK-DAG: st.param.b8 [param0+19], +; CHECK-DAG: st.param.b8 [param0+20], +; CHECK-DAG: st.param.b8 [param0+21], +; CHECK-DAG: st.param.b8 [param0+22], +; CHECK-DAG: st.param.b8 [param0+23], +; CHECK-DAG: st.param.b8 [param0+24], ; CHECK: .param .align 1 .b8 retval0[25]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i1i32x4p, -; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; -; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; -; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; -; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; -; CHECK-DAG: 
ld.param.b32 %r44, [retval0+13]; -; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; -; CHECK-DAG: st.param.b32 [func_retval0+0], -; CHECK-DAG: st.param.b32 [func_retval0+4], +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+0]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+4]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+19]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b8 [func_retval0+0], +; CHECK-DAG: st.param.b8 [func_retval0+1], +; CHECK-DAG: st.param.b8 [func_retval0+2], +; CHECK-DAG: st.param.b8 [func_retval0+3], +; CHECK-DAG: st.param.b8 [func_retval0+4], +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], ; CHECK-DAG: st.param.b8 [func_retval0+8], -; CHECK-DAG: st.param.b32 [func_retval0+9], 
-; CHECK-DAG: st.param.b32 [func_retval0+13], -; CHECK-DAG: st.param.b64 [func_retval0+17], +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK-DAG: st.param.b8 [func_retval0+17], +; CHECK-DAG: st.param.b8 [func_retval0+18], +; CHECK-DAG: st.param.b8 [func_retval0+19], +; CHECK-DAG: st.param.b8 [func_retval0+20], +; CHECK-DAG: st.param.b8 [func_retval0+21], +; CHECK-DAG: st.param.b8 [func_retval0+22], +; CHECK-DAG: st.param.b8 [func_retval0+23], +; CHECK-DAG: st.param.b8 [func_retval0+24], define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll new file mode 100644 index 0000000000000..40a3e9e945a23 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll @@ -0,0 +1,385 @@ +; Verifies correctness of load/store of parameters and return values. +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %} + +%s_i8i16p = type { <{ i16, i8, i16 }>, i64 } +%s_i8i32p = type { <{ i32, i8, i32 }>, i64 } +%s_i8i64p = type { <{ i64, i8, i64 }>, i64 } +%s_i8f16p = type { <{ half, i8, half }>, i64 } +%s_i8f16x2p = type { <{ <2 x half>, i8, <2 x half> }>, i64 } +%s_i8f32p = type { <{ float, i8, float }>, i64 } +%s_i8f64p = type { <{ double, i8, double }>, i64 } + +; -- All loads/stores from parameters aligned by one must be done one +; byte at a time. 
+; -- Notes: +; -- There are two fields of interest in the packed part of the struct, one +; with a proper offset and one without. The former should be loaded or +; stored as a whole, and the latter by bytes. +; -- Only loading and storing the said fields are checked in the following +; series of tests so that they are more concise. + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16]) +; CHECK-LABEL: test_s_i8i16p( +; CHECK: .param .align 8 .b8 test_s_i8i16p_param_0[16] +; CHECK-DAG: ld.param.u16 [[P0:%rs[0-9]+]], [test_s_i8i16p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8i16p_param_0+3]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8i16p_param_0+4]; +; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[16]; +; CHECK-DAG: st.param.b16 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]]; +; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]]; +; CHECK: .param .align 8 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8i16p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+3]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+4]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]]; +; CHECK-DAG: shl.b16 [[R2_1_shl:%rs[0-9]+]], [[R2_1]], 8; +; CHECK-DAG: and.b16 [[R2_0_and:%rs[0-9]+]], [[R2_0]], 255; +; CHECK-DAG: or.b16 [[R2:%rs[0-9]+]], [[R2_0_and]], [[R2_1_shl]]; +; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2]]; +; CHECK-DAG: and.b16 [[R2_1_and:%rs[0-9]+]], [[R2_1]], 255; +; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2_1_and]]; +; CHECK: ret; + +define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { + %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) + ret %s_i8i16p %r +} + +; CHECK: .visible .func (.param 
.align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i8i32p( +; CHECK: .param .align 8 .b8 test_s_i8i32p_param_0[24] +; CHECK-DAG: ld.param.u32 [[P0:%r[0-9]+]], [test_s_i8i32p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8i32p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8i32p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8i32p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8i32p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.b32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8i32p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; 
CHECK: ret; + +define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { + %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) + ret %s_i8i32p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32]) +; CHECK-LABEL: test_s_i8i64p( +; CHECK: .param .align 8 .b8 test_s_i8i64p_param_0[32] +; CHECK-DAG: ld.param.u64 [[P0:%rd[0-9]+]], [test_s_i8i64p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8i64p_param_0+9]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8i64p_param_0+10]; +; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8i64p_param_0+11]; +; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8i64p_param_0+12]; +; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8i64p_param_0+13]; +; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8i64p_param_0+14]; +; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8i64p_param_0+15]; +; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8i64p_param_0+16]; +; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]]; +; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8; +; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16; +; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24; +; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]]; +; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]]; +; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]]; +; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32; +; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]]; +; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16; +; CHECK-DAG: 
shr.u64 [[P2_shr_3:%rd[0-9]+]], [[P2]], 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16; +; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[32]; +; CHECK-DAG: st.param.b64 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+9], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]]; +; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]]; +; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]]; +; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]]; +; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]]; +; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]]; +; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8i64p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13]; +; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14]; +; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15]; +; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b64 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK: ret; + +define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { + %r = tail call 
%s_i8i64p @test_s_i8i64p(%s_i8i64p %a) + ret %s_i8i64p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16]) +; CHECK-LABEL: test_s_i8f16p( +; CHECK: .param .align 8 .b8 test_s_i8f16p_param_0[16] +; CHECK-DAG: ld.param.b16 [[P0:%rs[0-9]+]], [test_s_i8f16p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8f16p_param_0+3]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8f16p_param_0+4]; +; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[16]; +; CHECK-DAG: st.param.b16 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]]; +; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]]; +; CHECK: .param .align 8 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f16p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2I_0:%rs[0-9]+]], [retval0+3]; +; CHECK-DAG: ld.param.b8 [[R2I_1:%rs[0-9]+]], [retval0+4]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]]; +; CHECK-DAG: shl.b16 [[R2I_1_shl:%rs[0-9]+]], [[R2I_1]], 8; +; CHECK-DAG: and.b16 [[R2I_0_and:%rs[0-9]+]], [[R2I_0]], 255; +; CHECK-DAG: or.b16 [[R2I:%rs[0-9]+]], [[R2I_0_and]], [[R2I_1_shl]]; +; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2I]]; +; CHECK-DAG: and.b16 [[R2I_1_and:%rs[0-9]+]], [[R2I_1]], 255; +; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2I_1_and]]; +; CHECK: ret; + +define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { + %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) + ret %s_i8f16p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i8f16x2p( +; CHECK: .param .align 8 .b8 test_s_i8f16x2p_param_0[24] +; CHECK-DAG: ld.param.b32 [[P0:%r[0-9]+]], [test_s_i8f16x2p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], 
[test_s_i8f16x2p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f16x2p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f16x2p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f16x2p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.b32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f16x2p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK: ret; + +define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { + %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) + ret %s_i8f16x2p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) +; 
CHECK-LABEL: test_s_i8f32p( +; CHECK: .param .align 8 .b8 test_s_i8f32p_param_0[24] +; CHECK-DAG: ld.param.f32 [[P0:%f[0-9]+]], [test_s_i8f32p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f32p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f32p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f32p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f32p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.f32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f32p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.f32 [[R0:%f[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.f32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK: ret; + +define %s_i8f32p 
@test_s_i8f32p(%s_i8f32p %a) { + %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) + ret %s_i8f32p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32]) +; CHECK-LABEL: test_s_i8f64p( +; CHECK: .param .align 8 .b8 test_s_i8f64p_param_0[32] +; CHECK-DAG: ld.param.f64 [[P0:%fd[0-9]+]], [test_s_i8f64p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8f64p_param_0+9]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8f64p_param_0+10]; +; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8f64p_param_0+11]; +; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8f64p_param_0+12]; +; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8f64p_param_0+13]; +; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8f64p_param_0+14]; +; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8f64p_param_0+15]; +; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8f64p_param_0+16]; +; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]]; +; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8; +; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16; +; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24; +; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]]; +; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]]; +; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]]; +; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32; +; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]]; +; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16; +; CHECK-DAG: shr.u64 [[P2_shr_3:%rd[0-9]+]], 
[[P2]], 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16; +; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[32]; +; CHECK-DAG: st.param.f64 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+9], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]]; +; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]]; +; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]]; +; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]]; +; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]]; +; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]]; +; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f64p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.f64 [[R0:%fd[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13]; +; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14]; +; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15]; +; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16]; +; CHECK: } // callseq +; CHECK-DAG: st.param.f64 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK: ret; + +define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) { + %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) + 
ret %s_i8f64p %r +} From 19e518d2623c0e87a87ebf30405e74448bd1ee70 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Fri, 23 Feb 2024 09:36:32 +0800 Subject: [PATCH 300/351] [Clang][Parser] Have the depth of the abbreviated generic lambdas inside a requires clause differ from the surrounding generic lambda (#80656) A one-line fix, again : ) This fixes https://github.com/llvm/llvm-project/issues/78524 and the similar example at https://github.com/llvm/llvm-project/issues/78524#issuecomment-1899886951. We previously increased the template depth by one after parsing the attaching requires-clause on a lambda expression. This led to a problem where the 'auto' parameters of nested abbreviated generic lambdas, inside of a requires-expression, had the same depth as the template parameters of the surrounding lambda. Consequently, during the concept-checking stage, we ended up substituting these parameters with the wrong template arguments because they were at different levels. --- clang/docs/ReleaseNotes.rst | 4 +++ clang/lib/Parse/ParseExprCXX.cpp | 11 +++++++- .../Parser/cxx-concepts-requires-clause.cpp | 27 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 19cc5b7756431..529dd783ab738 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -277,6 +277,10 @@ Bug Fixes to C++ Support (`#82258 `_) - Correctly immediate-escalate lambda conversion functions. (`#82258 `_) +- Fixed an issue where template parameters of a nested abbreviated generic lambda within + a requires-clause lie at the same depth as those of the surrounding lambda. This, + in turn, results in the wrong template argument substitution during constraint checking. 
+ (`#78524 `_) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index fd262ff31e661..22ee60af4616d 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -1385,6 +1385,16 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( Diag(RAngleLoc, diag::err_lambda_template_parameter_list_empty); } else { + // We increase the template depth before recursing into a requires-clause. + // + // This depth is used for setting up a LambdaScopeInfo (in + // Sema::RecordParsingTemplateParameterDepth), which is used later when + // inventing template parameters in InventTemplateParameter. + // + // This way, abbreviated generic lambdas could have different template + // depths, avoiding substitution into the wrong template parameters during + // constraint satisfaction check. + ++CurTemplateDepthTracker; ExprResult RequiresClause; if (TryConsumeToken(tok::kw_requires)) { RequiresClause = @@ -1396,7 +1406,6 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( Actions.ActOnLambdaExplicitTemplateParameterList( Intro, LAngleLoc, TemplateParams, RAngleLoc, RequiresClause); - ++CurTemplateDepthTracker; } } diff --git a/clang/test/Parser/cxx-concepts-requires-clause.cpp b/clang/test/Parser/cxx-concepts-requires-clause.cpp index 1ec1eefa12865..5b5bc9ea978bf 100644 --- a/clang/test/Parser/cxx-concepts-requires-clause.cpp +++ b/clang/test/Parser/cxx-concepts-requires-clause.cpp @@ -168,3 +168,30 @@ auto lambda4 = [] requires(sizeof(char) == 1){}; // expected-error {{expected bo #if __cplusplus <= 202002L // expected-warning@-2{{lambda without a parameter clause is a C++23 extension}} #endif + +namespace GH78524 { + +template T Foo; + +template auto C(Foo); + +template struct D { + decltype(T()(C)) Type; +}; + +template D G(T, U) { return {}; } + +struct E {}; + +void F() { + G([] +// ~~~~~~~~~~ T: Depth: 0, Index: 0 + requires requires { [](auto...) 
{}; }(T) +// ~~~~ auto: Depth: 1, Index: 0 + { return T(); }, + E{}); +} + +int a = [] requires requires { [](auto){}; } { return 0; }(); + +} // namespace GH78524 From 5ccf54640a2bdb6f36f65c574feb312da7f75243 Mon Sep 17 00:00:00 2001 From: huaatian <142874007+huaatian@users.noreply.github.com> Date: Fri, 23 Feb 2024 10:25:02 +0800 Subject: [PATCH 301/351] [llvm][cmake] Performing expensive checks requires enabling assert. (#80821) LLVM will intercept errors using assert() when LLVM_ENABLE_EXPENSIVE_CHECKS is ON. So an explicit check is added. --------- Co-authored-by: Hua Tian --- llvm/cmake/modules/HandleLLVMOptions.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 4257083e53ad4..40316b11ceed9 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -128,6 +128,11 @@ if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR endif() if(LLVM_ENABLE_EXPENSIVE_CHECKS) + # When LLVM_ENABLE_EXPENSIVE_CHECKS is ON, LLVM will intercept errors + # using assert(). An explicit check is performed here. + if (NOT LLVM_ENABLE_ASSERTIONS) + message(FATAL_ERROR "LLVM_ENABLE_EXPENSIVE_CHECKS requires LLVM_ENABLE_ASSERTIONS \"ON\".") + endif() add_compile_definitions(EXPENSIVE_CHECKS) # In some libstdc++ versions, std::min_element is not constexpr when From 2e5af56b05c2d39ab2c829bf4c13190523b67ddd Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 23 Feb 2024 10:59:46 +0800 Subject: [PATCH 302/351] [C++20] [Modules] Allow to compile a pcm with and without -fPIC separately We can compile a module unit in 2 phase compilation: ``` clang++ -std=c++20 a.cppm --precompile -o a.pcm clang++ -std=c++20 a.pcm -c -o a.o ``` And it is a general requirement that we need to compile a translation unit with and without -fPIC for static and shared libraries. 
But for C++20 modules with 2 phase compilation, it may be a waste of time to compile them 2 times completely. It may be fine to generate one BMI and compile it with and without -fPIC separately. e.g., ``` clang++ -std=c++20 a.cppm --precompile -o a.pcm clang++ -std=c++20 a.pcm -c -o a.o clang++ -std=c++20 a.pcm -c -fPIC -o a-PIC.o ``` Then we can save the time to parse a.cppm repeatedly. --- clang/include/clang/Frontend/ASTUnit.h | 23 +++++++++++-------- .../include/clang/Frontend/CompilerInstance.h | 3 +++ .../clang/Frontend/CompilerInvocation.h | 1 + clang/lib/Frontend/ASTUnit.cpp | 15 ++++++++++-- clang/lib/Frontend/FrontendAction.cpp | 2 +- clang/test/Modules/compile-pcm-with-pic.cppm | 21 +++++++++++++++++ clang/tools/c-index-test/core_main.cpp | 2 +- clang/tools/libclang/CIndex.cpp | 2 +- 8 files changed, 54 insertions(+), 15 deletions(-) create mode 100644 clang/test/Modules/compile-pcm-with-pic.cppm diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h index 6af712afdcb6d..a2c1b25dd2247 100644 --- a/clang/include/clang/Frontend/ASTUnit.h +++ b/clang/include/clang/Frontend/ASTUnit.h @@ -691,16 +691,19 @@ class ASTUnit { /// lifetime is expected to extend past that of the returned ASTUnit. /// /// \returns - The initialized ASTUnit or null if the AST failed to load. 
- static std::unique_ptr LoadFromASTFile( - const std::string &Filename, const PCHContainerReader &PCHContainerRdr, - WhatToLoad ToLoad, IntrusiveRefCntPtr Diags, - const FileSystemOptions &FileSystemOpts, - std::shared_ptr HSOpts, bool OnlyLocalDecls = false, - CaptureDiagsKind CaptureDiagnostics = CaptureDiagsKind::None, - bool AllowASTWithCompilerErrors = false, - bool UserFilesAreVolatile = false, - IntrusiveRefCntPtr VFS = - llvm::vfs::getRealFileSystem()); + static std::unique_ptr + LoadFromASTFile(const std::string &Filename, + const PCHContainerReader &PCHContainerRdr, WhatToLoad ToLoad, + IntrusiveRefCntPtr Diags, + const FileSystemOptions &FileSystemOpts, + std::shared_ptr HSOpts, + std::shared_ptr LangOpts = nullptr, + bool OnlyLocalDecls = false, + CaptureDiagsKind CaptureDiagnostics = CaptureDiagsKind::None, + bool AllowASTWithCompilerErrors = false, + bool UserFilesAreVolatile = false, + IntrusiveRefCntPtr VFS = + llvm::vfs::getRealFileSystem()); private: /// Helper function for \c LoadFromCompilerInvocation() and diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index ac2f940769fbe..b97d0c636806a 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -311,6 +311,9 @@ class CompilerInstance : public ModuleLoader { LangOptions &getLangOpts() { return Invocation->getLangOpts(); } const LangOptions &getLangOpts() const { return Invocation->getLangOpts(); } + std::shared_ptr getLangOptsPtr() const { + return Invocation->getLangOptsPtr(); + } PreprocessorOptions &getPreprocessorOpts() { return Invocation->getPreprocessorOpts(); diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h index c6528779bde7b..8fc51e6ec03b6 100644 --- a/clang/include/clang/Frontend/CompilerInvocation.h +++ b/clang/include/clang/Frontend/CompilerInvocation.h @@ -271,6 +271,7 @@ class 
CompilerInvocation : public CompilerInvocationBase { std::shared_ptr getPreprocessorOptsPtr() { return PPOpts; } + std::shared_ptr getLangOptsPtr() { return LangOpts; } /// @} /// Create a compiler invocation from a list of input options. diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index f09a01b5dd4af..3610a08831e79 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -540,7 +540,17 @@ class ASTInfoCollector : public ASTReaderListener { if (InitializedLanguage) return false; + // FIXME: We did similar things in ReadHeaderSearchOptions too. But such + // style is not scaling. Probably we need to invite some mechanism to + // handle such patterns generally. + auto PICLevel = LangOpt.PICLevel; + auto PIE = LangOpt.PIE; + LangOpt = LangOpts; + + LangOpt.PICLevel = PICLevel; + LangOpt.PIE = PIE; + InitializedLanguage = true; updated(); @@ -790,7 +800,8 @@ std::unique_ptr ASTUnit::LoadFromASTFile( const std::string &Filename, const PCHContainerReader &PCHContainerRdr, WhatToLoad ToLoad, IntrusiveRefCntPtr Diags, const FileSystemOptions &FileSystemOpts, - std::shared_ptr HSOpts, bool OnlyLocalDecls, + std::shared_ptr HSOpts, + std::shared_ptr LangOpts, bool OnlyLocalDecls, CaptureDiagsKind CaptureDiagnostics, bool AllowASTWithCompilerErrors, bool UserFilesAreVolatile, IntrusiveRefCntPtr VFS) { std::unique_ptr AST(new ASTUnit(true)); @@ -804,7 +815,7 @@ std::unique_ptr ASTUnit::LoadFromASTFile( ConfigureDiags(Diags, *AST, CaptureDiagnostics); - AST->LangOpts = std::make_shared(); + AST->LangOpts = LangOpts ? 
LangOpts : std::make_shared(); AST->OnlyLocalDecls = OnlyLocalDecls; AST->CaptureDiagnostics = CaptureDiagnostics; AST->Diagnostics = Diags; diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index eff785b99a09a..b9fd9b8897b7e 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -689,7 +689,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, std::unique_ptr AST = ASTUnit::LoadFromASTFile( std::string(InputFile), CI.getPCHContainerReader(), ASTUnit::LoadEverything, Diags, CI.getFileSystemOpts(), - CI.getHeaderSearchOptsPtr()); + CI.getHeaderSearchOptsPtr(), CI.getLangOptsPtr()); if (!AST) return false; diff --git a/clang/test/Modules/compile-pcm-with-pic.cppm b/clang/test/Modules/compile-pcm-with-pic.cppm new file mode 100644 index 0000000000000..3d818dde0cd2f --- /dev/null +++ b/clang/test/Modules/compile-pcm-with-pic.cppm @@ -0,0 +1,21 @@ +// REQUIRES: x86-registered-target + +// RUN: rm -rf %t +// RUN: mkdir %t + +// RUN: %clang_cc1 -std=c++20 %s -pic-level 2 -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -pic-level 2 -fmodule-output=%t/m.pcm -emit-llvm -o - \ +// RUN: | FileCheck %s +// +// RUN: %clang_cc1 -std=c++20 %s -emit-module-interface -o %t/m.pcm +// RUN: %clang_cc1 -std=c++20 %t/m.pcm -pic-level 2 -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %t/m.pcm -emit-llvm -o - | FileCheck %s --check-prefix=NOPIC + +export module m; +export int x; +export int func() { + return x; +} + +// CHECK: ![[METADATA_NUM:[0-9]+]] = !{{{.*}}, !"PIC Level", i32 2} +// NOPIC-NOT: ![[METADATA_NUM:[0-9]+]] = !{{{.*}}, !"PIC Level", i32 2} diff --git a/clang/tools/c-index-test/core_main.cpp b/clang/tools/c-index-test/core_main.cpp index 56bf7c91acc7b..c552466c9a188 100644 --- a/clang/tools/c-index-test/core_main.cpp +++ b/clang/tools/c-index-test/core_main.cpp @@ -276,7 +276,7 @@ static bool printSourceSymbolsFromModule(StringRef modulePath, 
CompilerInstance::createDiagnostics(new DiagnosticOptions()); std::unique_ptr AU = ASTUnit::LoadFromASTFile( std::string(modulePath), *pchRdr, ASTUnit::LoadASTOnly, Diags, - FileSystemOpts, HSOpts, + FileSystemOpts, HSOpts, /*LangOpts=*/nullptr, /*OnlyLocalDecls=*/true, CaptureDiagsKind::None, /*AllowASTWithCompilerErrors=*/true, /*UserFilesAreVolatile=*/false); diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 4ded92cbe9aea..418b152ba4a13 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -3890,7 +3890,7 @@ enum CXErrorCode clang_createTranslationUnit2(CXIndex CIdx, std::unique_ptr AU = ASTUnit::LoadFromASTFile( ast_filename, CXXIdx->getPCHContainerOperations()->getRawReader(), ASTUnit::LoadEverything, Diags, FileSystemOpts, HSOpts, - CXXIdx->getOnlyLocalDecls(), CaptureDiagsKind::All, + /*LangOpts=*/nullptr, CXXIdx->getOnlyLocalDecls(), CaptureDiagsKind::All, /*AllowASTWithCompilerErrors=*/true, /*UserFilesAreVolatile=*/true); *out_TU = MakeCXTranslationUnit(CXXIdx, std::move(AU)); From 6e6bf9f81756ba6655b4eea8dc45469a47f89b39 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 22 Feb 2024 19:17:15 -0800 Subject: [PATCH 303/351] [WebAssembly] Disable multivalue emission temporarily (#82714) We plan to enable multivalue in the features section soon (#80923) for other reasons, such as the feature having been standardized for many years and other features being developed (e.g. EH) depending on it. This is separate from enabling Clang experimental multivalue ABI (`-Xclang -target-abi -Xclang experimental-mv`), but it turned out we generate some multivalue code in the backend as well if it is enabled in the features section. 
Given that our backend multivalue generation still has not been much used nor tested, and enabling the feature in the features section can be a separate decision from how much multivalue (including none) we decide to generate for now, I'd like to temporarily disable the actual generation of multivalue in our backend. To do that, this adds an internal flag `-wasm-emit-multivalue` that defaults to false. All our existing multivalue tests can use this to test multivalue code. This flag can be removed later when we are confident the multivalue generation is well tested. --- .../WebAssembly/WebAssemblyISelLowering.cpp | 7 +++-- .../WebAssemblyMachineFunctionInfo.cpp | 5 +++- .../WebAssemblyRuntimeLibcallSignatures.cpp | 26 ++++++++++--------- .../WebAssembly/WebAssemblyTargetMachine.cpp | 9 +++++++ .../lower-em-ehsjlj-multi-return.ll | 4 +-- .../multivalue-dont-move-def-past-use.mir | 2 +- .../WebAssembly/multivalue-stackify.ll | 2 +- llvm/test/CodeGen/WebAssembly/multivalue.ll | 10 ++++--- .../CodeGen/WebAssembly/multivalue_libcall.ll | 2 +- 9 files changed, 43 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7c47790d1e351..36f067956e63a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -43,6 +43,8 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" +extern cl::opt WasmEmitMultiValue; + WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -1288,7 +1290,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn( const SmallVectorImpl &Outs, LLVMContext & /*Context*/) const { // WebAssembly can only handle returning tuples with multivalue enabled - return Subtarget->hasMultivalue() || Outs.size() <= 1; + return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 
1; } SDValue WebAssemblyTargetLowering::LowerReturn( @@ -1296,7 +1298,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - assert((Subtarget->hasMultivalue() || Outs.size() <= 1) && + assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) || + Outs.size() <= 1) && "MVP WebAssembly can only return up to one value"); if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 1e959111a4dbc..b969b8370a3e5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -22,6 +22,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +extern cl::opt WasmEmitMultiValue; + WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor. MachineFunctionInfo *WebAssemblyFunctionInfo::clone( @@ -71,7 +73,8 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); if (Results.size() > 1 && - !TM.getSubtarget(ContextFunc).hasMultivalue()) { + (!TM.getSubtarget(ContextFunc).hasMultivalue() || + !WasmEmitMultiValue)) { // WebAssembly can't lower returns of multiple values without demoting to // sret unless multivalue is enabled (see // WebAssemblyTargetLowering::CanLowerReturn). 
So replace multiple return diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 3e2e029695ab6..2a84c90c89602 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -24,6 +24,8 @@ using namespace llvm; +extern cl::opt WasmEmitMultiValue; + namespace { enum RuntimeLibcallSignature { @@ -694,7 +696,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_func_f32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -703,7 +705,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F32); break; case i64_i64_func_f64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -712,7 +714,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F64); break; case i16_i16_func_i16_i16: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -722,7 +724,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i32_i32_func_i32_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -732,7 +734,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64_i64: - if 
(Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -742,7 +744,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -754,7 +756,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_iPTR: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -767,7 +769,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); @@ -781,7 +783,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -851,7 +853,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -865,7 +867,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget 
&Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -874,7 +876,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 42043a7b8680a..3120b6b67906e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -54,6 +54,15 @@ static cl::opt WasmDisableFixIrreducibleControlFlowPass( " irreducible control flow optimization pass"), cl::init(false)); +// A temporary option to control emission of multivalue until multivalue +// implementation is stable enough. We currently don't emit multivalue by +// default even if the feature section allows it. +// TODO Stabilize multivalue and delete this option +cl::opt + WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden, + cl::desc("WebAssembly: Emit multivalue in the backend"), + cl::init(false)); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { // Register the target. 
RegisterTargetMachine X( diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll index 4f33439db770d..daf46c6eef025 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll @@ -1,5 +1,5 @@ -; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=EH -; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=EH +; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 -wasm-emit-multivalue | FileCheck %s --check-prefix=SJLJ ; Currently multivalue returning functions are not supported in Emscripten EH / ; SjLj. Make sure they error out. diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir index 4b4661b144667..4fadbd5f07e6d 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir +++ b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll index 52a8c686824d3..f4f93ac2f30ce 100644 --- 
a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Test functions have been generated by multivalue-stackify.py. -; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s ; Test that the multivalue stackification works diff --git a/llvm/test/CodeGen/WebAssembly/multivalue.ll b/llvm/test/CodeGen/WebAssembly/multivalue.ll index 675009c8f3e54..846691e5ff0cd 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll @@ -1,7 +1,8 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call | FileCheck --check-prefix REF %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix REGS -; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call | obj2yaml | FileCheck %s --check-prefix OBJ +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call -wasm-emit-multivalue | FileCheck --check-prefix REF %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s --check-prefix REGS +; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | 
obj2yaml | FileCheck %s --check-prefix OBJ +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix NO-MULTIVALUE ; Test that the multivalue calls, returns, function types, and block ; types work as expected. @@ -19,6 +20,7 @@ declare void @use_i64(i64) ; CHECK-NEXT: i32.const 42{{$}} ; CHECK-NEXT: i64.const 42{{$}} ; CHECK-NEXT: end_function{{$}} +; NO-MULTIVALUE-NOT: .functype pair_const () -> (i32, i64) define %pair @pair_const() { ret %pair { i32 42, i64 42 } } diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll index 47c5ae7b457dd..7bf37b59353ad 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue | FileCheck %s --check-prefix=MULTIVALUE +; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s --check-prefix=MULTIVALUE ; RUN: llc < %s -verify-machineinstrs -mcpu=mvp | FileCheck %s --check-prefix=NO_MULTIVALUE ; Test libcall signatures when multivalue is enabled and disabled From ca09e08239008759f92f4aff39c7640da3e1bfa9 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Thu, 22 Feb 2024 19:41:15 -0800 Subject: [PATCH 304/351] [Symbolizer][WebAssembly] Use wasm-specific getSymbolSize (#82083) getSymbolSize was recently added to WasmObjectFile and has correct sizes for most symbol types. This makes llvm-symbolizer correctly symbolize addresses in the middle of the symbol. When reworking the test I also noticed that the DWARF info seems to be wrong for the first instruction in each function. I noted that in the test comments but didn't attempt to fix here. 
--- llvm/lib/Object/SymbolSize.cpp | 7 +++ llvm/test/tools/llvm-symbolizer/wasm-basic.s | 53 ++++++++++++++++---- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Object/SymbolSize.cpp b/llvm/lib/Object/SymbolSize.cpp index cb20feffb710b..635cd8373afbf 100644 --- a/llvm/lib/Object/SymbolSize.cpp +++ b/llvm/lib/Object/SymbolSize.cpp @@ -65,6 +65,13 @@ llvm::object::computeSymbolSizes(const ObjectFile &O) { return Ret; } + if (const auto *E = dyn_cast(&O)) { + for (SymbolRef Sym : E->symbols()) { + Ret.push_back({Sym, E->getSymbolSize(Sym)}); + } + return Ret; + } + // Collect sorted symbol addresses. Include dummy addresses for the end // of each section. std::vector Addresses; diff --git a/llvm/test/tools/llvm-symbolizer/wasm-basic.s b/llvm/test/tools/llvm-symbolizer/wasm-basic.s index cc189abcfca80..1f425e5259316 100644 --- a/llvm/test/tools/llvm-symbolizer/wasm-basic.s +++ b/llvm/test/tools/llvm-symbolizer/wasm-basic.s @@ -1,24 +1,59 @@ # REQUIRES: webassembly-registered-target # RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj %s -o %t.o -g +# RUN: llvm-symbolizer --basenames --output-style=GNU -e %t.o 1 2 3 4 5 6 7 8 9 10 11 12 13 | FileCheck %s foo: .functype foo () -> () nop + return end_function bar: .functype bar (i32) -> (i32) local.get 0 + nop return end_function -# RUN: llvm-symbolizer -e %t.o 3 4 7 8 | FileCheck %s -## Byte 1 is the function length and 2 is the locals declaration. -## Currently no line corresponds to them. -## TODO: create a loc for .functype? -## Test 2 functions to ensure wasm's function-sections system works. -# CHECK: wasm-basic.s:6:0 -# CHECK: wasm-basic.s:7:0 -# CHECK: wasm-basic.s:11:0 -# CHECK: wasm-basic.s:11:0 +## Symbols start from (including) the function length and should cover all the +## way to the next symbol start. +## TODO: create a loc for .functype? It could go with the local declarations. 
+ +## Byte 1 is the function length, has no loc but the symbol table considers it +## the start of the function +# CHECK: foo +# CHECK-NEXT: ??:0 +## Byte 2 is the local declaration, but for some reason DWARF is marking it as line 7. +## TODO: figure out why. +# CHECK-NEXT: foo +# CHECK-NEXT: wasm-basic.s:7 +## Byte 3 is actually the nop, line 7 +# CHECK-NEXT: foo +# CHECK-NEXT: wasm-basic.s:7 +## Byte 4 is the return, line 8 +# CHECK-NEXT: foo +# CHECK-NEXT: wasm-basic.s:8 +## Byte 5 is the end_function, line 9 +# CHECK-NEXT: foo +# CHECK-NEXT: wasm-basic.s:9 +## Byte 6 is bar's function length, symbol table considers it part of bar +# CHECK-NEXT: bar +# CHECK-NEXT: ??:0 +## Byte 7 bar's local declaration, but DWARF marks it as line 13, like above +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:13 +## Byte 8 and 9 are actually the local.get on line 13 +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:13 +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:13 +## Byte 10 is the nop +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:14 +## Byte b is the return +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:15 +## Byte c is end_function +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:16 From de41eae41f0dc2a844b439e0246e29c1bcbb2d03 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 22 Feb 2024 20:18:52 -0800 Subject: [PATCH 305/351] [SelectionDAG][RISCV] Use FP type for legality query for LRINT/LLRINT in LegalizeVectorOps. (#82728) This matches how LRINT/LLRINT is queried for scalar types in LegalizeDAG. It's confusing if they do different things since a "Legal" vector LRINT/LLRINT would get through to LegalizeDAG which would then consider it illegal. This doesn't happen currently because RISC-V uses Custom. 
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 4 ++-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 2a7aaf88847ea..6074498d9144f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -404,8 +404,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FCEIL: case ISD::FTRUNC: case ISD::FRINT: - case ISD::LRINT: - case ISD::LLRINT: case ISD::FNEARBYINT: case ISD::FROUND: case ISD::FROUNDEVEN: @@ -455,6 +453,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Node->getValueType(0), Scale); break; } + case ISD::LRINT: + case ISD::LLRINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::VECREDUCE_ADD: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5c67aaf678566..04d5e60500ce6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -830,7 +830,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT, Custom); setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT, Custom); - setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); setOperationAction({ISD::AVGFLOORU, ISD::AVGCEILU, ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, Legal); @@ -956,6 +955,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // between vXf16 and vXf64 must be lowered as sequences which convert via // vXf32. setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); + setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); // Custom-lower insert/extract operations to simplify patterns. 
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, Custom); From 2d50703ddd4fcf7826e4b62cba38e3151314ca60 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 23 Feb 2024 12:46:37 +0800 Subject: [PATCH 306/351] [RISCV] Use RISCVSubtarget::getRealVLen() in more places. NFC Catching a couple of more places where we can use the new query added in 8603a7b2. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 31 +++++++++------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 04d5e60500ce6..7540b22d13b7f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3848,11 +3848,10 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // If we're compiling for an exact VLEN value, we can split our work per // register in the register group. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); - if (MinVLen == MaxVLen && VT.getSizeInBits().getKnownMinValue() > MinVLen) { + if (const auto VLen = Subtarget.getRealVLen(); + VLen && VT.getSizeInBits().getKnownMinValue() > *VLen) { MVT ElemVT = VT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg); MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget); @@ -4763,9 +4762,8 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, // If we don't know exact data layout, not much we can do. If this // is already m1 or smaller, no point in splitting further. 
- const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); - if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen) + const auto VLen = Subtarget.getRealVLen(); + if (!VLen || VT.getSizeInBits().getFixedValue() <= *VLen) return SDValue(); // Avoid picking up bitrotate patterns which we have a linear-in-lmul @@ -4776,7 +4774,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, return SDValue(); MVT ElemVT = VT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); unsigned VRegsPerSrc = NumElts / ElemsPerVReg; SmallVector>> @@ -8328,15 +8326,13 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, // constant index, we can always perform the extract in m1 (or // smaller) as we can determine the register corresponding to // the index in the register group. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); + const auto VLen = Subtarget.getRealVLen(); if (auto *IdxC = dyn_cast(Idx); - IdxC && MinVLen == MaxVLen && - VecVT.getSizeInBits().getKnownMinValue() > MinVLen) { + IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) { MVT M1VT = getLMUL1VT(ContainerVT); unsigned OrigIdx = IdxC->getZExtValue(); EVT ElemVT = VecVT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); unsigned RemIdx = OrigIdx % ElemsPerVReg; unsigned SubRegIdx = OrigIdx / ElemsPerVReg; unsigned ExtractIdx = @@ -9797,15 +9793,14 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, if (OrigIdx == 0) return Op; - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); + const auto VLen = Subtarget.getRealVLen(); // If the subvector vector is a fixed-length type and we 
don't know VLEN // exactly, we cannot use subregister manipulation to simplify the codegen; we // don't know which register of a LMUL group contains the specific subvector // as we only know the minimum register size. Therefore we must slide the // vector group down the full amount. - if (SubVecVT.isFixedLengthVector() && MinVLen != MaxVLen) { + if (SubVecVT.isFixedLengthVector() && !VLen) { MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(VecVT); @@ -9852,8 +9847,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if // we have a fixed length subvector, we need to adjust the index by 1/vscale. if (SubVecVT.isFixedLengthVector()) { - assert(MinVLen == MaxVLen); - unsigned Vscale = MinVLen / RISCV::RVVBitsPerBlock; + assert(VLen); + unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock; auto Decompose = RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( VecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI); From 0d72fe9777e7c131dfb50c172b944d64437e2ece Mon Sep 17 00:00:00 2001 From: shkoo Date: Thu, 22 Feb 2024 21:27:01 -0800 Subject: [PATCH 307/351] [mlir] Fix FunctionOpInterface extraSharedClassDeclaration to be fully namespace qualified (#82682) `extraSharedClassDeclaration` of `FunctionOpInterface` can be inherited by other `OpInterfaces` into foreign namespaces, thus types must be fully qualified to prevent compiler errors, for example: def MyFunc : OpInterface<"MyFunc", [FunctionOpInterface]> { let cppNamespace = "::MyNamespace"; } --- .../mlir/Interfaces/FunctionInterfaces.td | 226 +++++++++--------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/mlir/include/mlir/Interfaces/FunctionInterfaces.td b/mlir/include/mlir/Interfaces/FunctionInterfaces.td index 98e002565cf19..970a781c998b9 100644 --- a/mlir/include/mlir/Interfaces/FunctionInterfaces.td +++ 
b/mlir/include/mlir/Interfaces/FunctionInterfaces.td @@ -147,12 +147,12 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ }]; let extraSharedClassDeclaration = [{ /// Block list iterator types. - using BlockListType = Region::BlockListType; + using BlockListType = ::mlir::Region::BlockListType; using iterator = BlockListType::iterator; using reverse_iterator = BlockListType::reverse_iterator; /// Block argument iterator types. - using BlockArgListType = Region::BlockArgListType; + using BlockArgListType = ::mlir::Region::BlockArgListType; using args_iterator = BlockArgListType::iterator; //===------------------------------------------------------------------===// @@ -163,7 +163,7 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ bool isExternal() { return empty(); } /// Return the region containing the body of this function. - Region &getFunctionBody() { return $_op->getRegion(0); } + ::mlir::Region &getFunctionBody() { return $_op->getRegion(0); } /// Delete all blocks from this function. void eraseBody() { @@ -183,39 +183,39 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ bool empty() { return getFunctionBody().empty(); } /// Push a new block to the back of the body region. - void push_back(Block *block) { getFunctionBody().push_back(block); } + void push_back(::mlir::Block *block) { getFunctionBody().push_back(block); } /// Push a new block to the front of the body region. - void push_front(Block *block) { getFunctionBody().push_front(block); } + void push_front(::mlir::Block *block) { getFunctionBody().push_front(block); } /// Return the last block in the body region. - Block &back() { return getFunctionBody().back(); } + ::mlir::Block &back() { return getFunctionBody().back(); } /// Return the first block in the body region. 
- Block &front() { return getFunctionBody().front(); } + ::mlir::Block &front() { return getFunctionBody().front(); } /// Add an entry block to an empty function, and set up the block arguments /// to match the signature of the function. The newly inserted entry block /// is returned. - Block *addEntryBlock() { + ::mlir::Block *addEntryBlock() { assert(empty() && "function already has an entry block"); - Block *entry = new Block(); + ::mlir::Block *entry = new ::mlir::Block(); push_back(entry); // FIXME: Allow for passing in locations for these arguments instead of using // the operations location. - ArrayRef inputTypes = $_op.getArgumentTypes(); - SmallVector locations(inputTypes.size(), - $_op.getOperation()->getLoc()); + ::llvm::ArrayRef<::mlir::Type> inputTypes = $_op.getArgumentTypes(); + ::llvm::SmallVector<::mlir::Location> locations(inputTypes.size(), + $_op.getOperation()->getLoc()); entry->addArguments(inputTypes, locations); return entry; } /// Add a normal block to the end of the function's block list. The function /// should at least already have an entry block. - Block *addBlock() { + ::mlir::Block *addBlock() { assert(!empty() && "function should at least have an entry block"); - push_back(new Block()); + push_back(new ::mlir::Block()); return &back(); } @@ -230,8 +230,8 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// - the argument/result attributes may need an update: if the new type /// has less parameters we drop the extra attributes, if there are more /// parameters they won't have any attributes. 
- void setType(Type newType) { - function_interface_impl::setFunctionType($_op, newType); + void setType(::mlir::Type newType) { + ::mlir::function_interface_impl::setFunctionType($_op, newType); } //===------------------------------------------------------------------===// @@ -245,7 +245,7 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ unsigned getNumResults() { return $_op.getResultTypes().size(); } /// Returns the entry block function argument at the given index. - BlockArgument getArgument(unsigned idx) { + ::mlir::BlockArgument getArgument(unsigned idx) { return getFunctionBody().getArgument(idx); } @@ -256,8 +256,8 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// Insert a single argument of type `argType` with attributes `argAttrs` and /// location `argLoc` at `argIndex`. - void insertArgument(unsigned argIndex, Type argType, DictionaryAttr argAttrs, - Location argLoc) { + void insertArgument(unsigned argIndex, ::mlir::Type argType, ::mlir::DictionaryAttr argAttrs, + ::mlir::Location argLoc) { insertArguments({argIndex}, {argType}, {argAttrs}, {argLoc}); } @@ -265,20 +265,20 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// listed indices. `argIndices` must be sorted. Arguments are inserted in the /// order they are listed, such that arguments with identical index will /// appear in the same order that they were listed here. 
- void insertArguments(ArrayRef argIndices, TypeRange argTypes, - ArrayRef argAttrs, - ArrayRef argLocs) { + void insertArguments(::llvm::ArrayRef argIndices, ::mlir::TypeRange argTypes, + ::llvm::ArrayRef<::mlir::DictionaryAttr> argAttrs, + ::llvm::ArrayRef<::mlir::Location> argLocs) { unsigned originalNumArgs = $_op.getNumArguments(); - Type newType = $_op.getTypeWithArgsAndResults( + ::mlir::Type newType = $_op.getTypeWithArgsAndResults( argIndices, argTypes, /*resultIndices=*/{}, /*resultTypes=*/{}); - function_interface_impl::insertFunctionArguments( + ::mlir::function_interface_impl::insertFunctionArguments( $_op, argIndices, argTypes, argAttrs, argLocs, originalNumArgs, newType); } /// Insert a single result of type `resultType` at `resultIndex`. - void insertResult(unsigned resultIndex, Type resultType, - DictionaryAttr resultAttrs) { + void insertResult(unsigned resultIndex, ::mlir::Type resultType, + ::mlir::DictionaryAttr resultAttrs) { insertResults({resultIndex}, {resultType}, {resultAttrs}); } @@ -286,41 +286,41 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// `resultIndices` must be sorted. Results are inserted in the order they are /// listed, such that results with identical index will appear in the same /// order that they were listed here. - void insertResults(ArrayRef resultIndices, TypeRange resultTypes, - ArrayRef resultAttrs) { + void insertResults(::llvm::ArrayRef resultIndices, ::mlir::TypeRange resultTypes, + ::llvm::ArrayRef<::mlir::DictionaryAttr> resultAttrs) { unsigned originalNumResults = $_op.getNumResults(); - Type newType = $_op.getTypeWithArgsAndResults( + ::mlir::Type newType = $_op.getTypeWithArgsAndResults( /*argIndices=*/{}, /*argTypes=*/{}, resultIndices, resultTypes); - function_interface_impl::insertFunctionResults( + ::mlir::function_interface_impl::insertFunctionResults( $_op, resultIndices, resultTypes, resultAttrs, originalNumResults, newType); } /// Erase a single argument at `argIndex`. 
void eraseArgument(unsigned argIndex) { - BitVector argsToErase($_op.getNumArguments()); + ::llvm::BitVector argsToErase($_op.getNumArguments()); argsToErase.set(argIndex); eraseArguments(argsToErase); } /// Erases the arguments listed in `argIndices`. - void eraseArguments(const BitVector &argIndices) { - Type newType = $_op.getTypeWithoutArgs(argIndices); - function_interface_impl::eraseFunctionArguments( + void eraseArguments(const ::llvm::BitVector &argIndices) { + ::mlir::Type newType = $_op.getTypeWithoutArgs(argIndices); + ::mlir::function_interface_impl::eraseFunctionArguments( $_op, argIndices, newType); } /// Erase a single result at `resultIndex`. void eraseResult(unsigned resultIndex) { - BitVector resultsToErase($_op.getNumResults()); + ::llvm::BitVector resultsToErase($_op.getNumResults()); resultsToErase.set(resultIndex); eraseResults(resultsToErase); } /// Erases the results listed in `resultIndices`. - void eraseResults(const BitVector &resultIndices) { - Type newType = $_op.getTypeWithoutResults(resultIndices); - function_interface_impl::eraseFunctionResults( + void eraseResults(const ::llvm::BitVector &resultIndices) { + ::mlir::Type newType = $_op.getTypeWithoutResults(resultIndices); + ::mlir::function_interface_impl::eraseFunctionResults( $_op, resultIndices, newType); } @@ -328,13 +328,13 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// results inserted. This is used to update the function's signature in /// the `insertArguments` and `insertResults` methods. The arrays must be /// sorted by increasing index. 
- Type getTypeWithArgsAndResults( - ArrayRef argIndices, TypeRange argTypes, - ArrayRef resultIndices, TypeRange resultTypes) { - SmallVector argStorage, resultStorage; - TypeRange newArgTypes = insertTypesInto( + ::mlir::Type getTypeWithArgsAndResults( + ::llvm::ArrayRef argIndices, ::mlir::TypeRange argTypes, + ::llvm::ArrayRef resultIndices, ::mlir::TypeRange resultTypes) { + ::llvm::SmallVector<::mlir::Type> argStorage, resultStorage; + ::mlir::TypeRange newArgTypes = insertTypesInto( $_op.getArgumentTypes(), argIndices, argTypes, argStorage); - TypeRange newResultTypes = insertTypesInto( + ::mlir::TypeRange newResultTypes = insertTypesInto( $_op.getResultTypes(), resultIndices, resultTypes, resultStorage); return $_op.cloneTypeWith(newArgTypes, newResultTypes); } @@ -342,24 +342,24 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// Return the type of this function without the specified arguments and /// results. This is used to update the function's signature in the /// `eraseArguments` and `eraseResults` methods. 
- Type getTypeWithoutArgsAndResults( - const BitVector &argIndices, const BitVector &resultIndices) { - SmallVector argStorage, resultStorage; - TypeRange newArgTypes = filterTypesOut( + ::mlir::Type getTypeWithoutArgsAndResults( + const ::llvm::BitVector &argIndices, const ::llvm::BitVector &resultIndices) { + ::llvm::SmallVector<::mlir::Type> argStorage, resultStorage; + ::mlir::TypeRange newArgTypes = filterTypesOut( $_op.getArgumentTypes(), argIndices, argStorage); - TypeRange newResultTypes = filterTypesOut( + ::mlir::TypeRange newResultTypes = filterTypesOut( $_op.getResultTypes(), resultIndices, resultStorage); return $_op.cloneTypeWith(newArgTypes, newResultTypes); } - Type getTypeWithoutArgs(const BitVector &argIndices) { - SmallVector argStorage; - TypeRange newArgTypes = filterTypesOut( + ::mlir::Type getTypeWithoutArgs(const ::llvm::BitVector &argIndices) { + ::llvm::SmallVector<::mlir::Type> argStorage; + ::mlir::TypeRange newArgTypes = filterTypesOut( $_op.getArgumentTypes(), argIndices, argStorage); return $_op.cloneTypeWith(newArgTypes, $_op.getResultTypes()); } - Type getTypeWithoutResults(const BitVector &resultIndices) { - SmallVector resultStorage; - TypeRange newResultTypes = filterTypesOut( + ::mlir::Type getTypeWithoutResults(const ::llvm::BitVector &resultIndices) { + ::llvm::SmallVector<::mlir::Type> resultStorage; + ::mlir::TypeRange newResultTypes = filterTypesOut( $_op.getResultTypes(), resultIndices, resultStorage); return $_op.cloneTypeWith($_op.getArgumentTypes(), newResultTypes); } @@ -369,88 +369,88 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ //===------------------------------------------------------------------===// /// Return all of the attributes for the argument at 'index'. 
- ArrayRef getArgAttrs(unsigned index) { - return function_interface_impl::getArgAttrs($_op, index); + ::llvm::ArrayRef<::mlir::NamedAttribute> getArgAttrs(unsigned index) { + return ::mlir::function_interface_impl::getArgAttrs($_op, index); } /// Return an ArrayAttr containing all argument attribute dictionaries of /// this function, or nullptr if no arguments have attributes. - ArrayAttr getAllArgAttrs() { return $_op.getArgAttrsAttr(); } + ::mlir::ArrayAttr getAllArgAttrs() { return $_op.getArgAttrsAttr(); } /// Return all argument attributes of this function. - void getAllArgAttrs(SmallVectorImpl &result) { - if (ArrayAttr argAttrs = getAllArgAttrs()) { - auto argAttrRange = argAttrs.template getAsRange(); + void getAllArgAttrs(::llvm::SmallVectorImpl<::mlir::DictionaryAttr> &result) { + if (::mlir::ArrayAttr argAttrs = getAllArgAttrs()) { + auto argAttrRange = argAttrs.template getAsRange<::mlir::DictionaryAttr>(); result.append(argAttrRange.begin(), argAttrRange.end()); } else { result.append($_op.getNumArguments(), - DictionaryAttr::get(this->getOperation()->getContext())); + ::mlir::DictionaryAttr::get(this->getOperation()->getContext())); } } /// Return the specified attribute, if present, for the argument at 'index', /// null otherwise. - Attribute getArgAttr(unsigned index, StringAttr name) { + ::mlir::Attribute getArgAttr(unsigned index, ::mlir::StringAttr name) { auto argDict = getArgAttrDict(index); return argDict ? argDict.get(name) : nullptr; } - Attribute getArgAttr(unsigned index, StringRef name) { + ::mlir::Attribute getArgAttr(unsigned index, ::llvm::StringRef name) { auto argDict = getArgAttrDict(index); return argDict ? 
argDict.get(name) : nullptr; } template - AttrClass getArgAttrOfType(unsigned index, StringAttr name) { + AttrClass getArgAttrOfType(unsigned index, ::mlir::StringAttr name) { return ::llvm::dyn_cast_or_null(getArgAttr(index, name)); } template - AttrClass getArgAttrOfType(unsigned index, StringRef name) { + AttrClass getArgAttrOfType(unsigned index, ::llvm::StringRef name) { return ::llvm::dyn_cast_or_null(getArgAttr(index, name)); } /// Set the attributes held by the argument at 'index'. - void setArgAttrs(unsigned index, ArrayRef attributes) { - function_interface_impl::setArgAttrs($_op, index, attributes); + void setArgAttrs(unsigned index, ::llvm::ArrayRef<::mlir::NamedAttribute> attributes) { + ::mlir::function_interface_impl::setArgAttrs($_op, index, attributes); } /// Set the attributes held by the argument at 'index'. `attributes` may be /// null, in which case any existing argument attributes are removed. - void setArgAttrs(unsigned index, DictionaryAttr attributes) { - function_interface_impl::setArgAttrs($_op, index, attributes); + void setArgAttrs(unsigned index, ::mlir::DictionaryAttr attributes) { + ::mlir::function_interface_impl::setArgAttrs($_op, index, attributes); } - void setAllArgAttrs(ArrayRef attributes) { + void setAllArgAttrs(::llvm::ArrayRef<::mlir::DictionaryAttr> attributes) { assert(attributes.size() == $_op.getNumArguments()); - function_interface_impl::setAllArgAttrDicts($_op, attributes); + ::mlir::function_interface_impl::setAllArgAttrDicts($_op, attributes); } - void setAllArgAttrs(ArrayRef attributes) { + void setAllArgAttrs(::llvm::ArrayRef<::mlir::Attribute> attributes) { assert(attributes.size() == $_op.getNumArguments()); - function_interface_impl::setAllArgAttrDicts($_op, attributes); + ::mlir::function_interface_impl::setAllArgAttrDicts($_op, attributes); } - void setAllArgAttrs(ArrayAttr attributes) { + void setAllArgAttrs(::mlir::ArrayAttr attributes) { assert(attributes.size() == $_op.getNumArguments()); 
$_op.setArgAttrsAttr(attributes); } /// If the an attribute exists with the specified name, change it to the new /// value. Otherwise, add a new attribute with the specified name/value. - void setArgAttr(unsigned index, StringAttr name, Attribute value) { - function_interface_impl::setArgAttr($_op, index, name, value); + void setArgAttr(unsigned index, ::mlir::StringAttr name, ::mlir::Attribute value) { + ::mlir::function_interface_impl::setArgAttr($_op, index, name, value); } - void setArgAttr(unsigned index, StringRef name, Attribute value) { + void setArgAttr(unsigned index, ::llvm::StringRef name, ::mlir::Attribute value) { setArgAttr(index, - StringAttr::get(this->getOperation()->getContext(), name), + ::mlir::StringAttr::get(this->getOperation()->getContext(), name), value); } /// Remove the attribute 'name' from the argument at 'index'. Return the /// attribute that was erased, or nullptr if there was no attribute with /// such name. - Attribute removeArgAttr(unsigned index, StringAttr name) { - return function_interface_impl::removeArgAttr($_op, index, name); + ::mlir::Attribute removeArgAttr(unsigned index, ::mlir::StringAttr name) { + return ::mlir::function_interface_impl::removeArgAttr($_op, index, name); } - Attribute removeArgAttr(unsigned index, StringRef name) { + ::mlir::Attribute removeArgAttr(unsigned index, ::llvm::StringRef name) { return removeArgAttr( - index, StringAttr::get(this->getOperation()->getContext(), name)); + index, ::mlir::StringAttr::get(this->getOperation()->getContext(), name)); } //===------------------------------------------------------------------===// @@ -458,102 +458,102 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ //===------------------------------------------------------------------===// /// Return all of the attributes for the result at 'index'. 
- ArrayRef getResultAttrs(unsigned index) { - return function_interface_impl::getResultAttrs($_op, index); + ::llvm::ArrayRef<::mlir::NamedAttribute> getResultAttrs(unsigned index) { + return ::mlir::function_interface_impl::getResultAttrs($_op, index); } /// Return an ArrayAttr containing all result attribute dictionaries of this /// function, or nullptr if no result have attributes. - ArrayAttr getAllResultAttrs() { return $_op.getResAttrsAttr(); } + ::mlir::ArrayAttr getAllResultAttrs() { return $_op.getResAttrsAttr(); } /// Return all result attributes of this function. - void getAllResultAttrs(SmallVectorImpl &result) { - if (ArrayAttr argAttrs = getAllResultAttrs()) { - auto argAttrRange = argAttrs.template getAsRange(); + void getAllResultAttrs(::llvm::SmallVectorImpl<::mlir::DictionaryAttr> &result) { + if (::mlir::ArrayAttr argAttrs = getAllResultAttrs()) { + auto argAttrRange = argAttrs.template getAsRange<::mlir::DictionaryAttr>(); result.append(argAttrRange.begin(), argAttrRange.end()); } else { result.append($_op.getNumResults(), - DictionaryAttr::get(this->getOperation()->getContext())); + ::mlir::DictionaryAttr::get(this->getOperation()->getContext())); } } /// Return the specified attribute, if present, for the result at 'index', /// null otherwise. - Attribute getResultAttr(unsigned index, StringAttr name) { + ::mlir::Attribute getResultAttr(unsigned index, ::mlir::StringAttr name) { auto argDict = getResultAttrDict(index); return argDict ? argDict.get(name) : nullptr; } - Attribute getResultAttr(unsigned index, StringRef name) { + ::mlir::Attribute getResultAttr(unsigned index, ::llvm::StringRef name) { auto argDict = getResultAttrDict(index); return argDict ? 
argDict.get(name) : nullptr; } template - AttrClass getResultAttrOfType(unsigned index, StringAttr name) { + AttrClass getResultAttrOfType(unsigned index, ::mlir::StringAttr name) { return ::llvm::dyn_cast_or_null(getResultAttr(index, name)); } template - AttrClass getResultAttrOfType(unsigned index, StringRef name) { + AttrClass getResultAttrOfType(unsigned index, ::llvm::StringRef name) { return ::llvm::dyn_cast_or_null(getResultAttr(index, name)); } /// Set the attributes held by the result at 'index'. - void setResultAttrs(unsigned index, ArrayRef attributes) { - function_interface_impl::setResultAttrs($_op, index, attributes); + void setResultAttrs(unsigned index, ::llvm::ArrayRef<::mlir::NamedAttribute> attributes) { + ::mlir::function_interface_impl::setResultAttrs($_op, index, attributes); } /// Set the attributes held by the result at 'index'. `attributes` may be /// null, in which case any existing argument attributes are removed. - void setResultAttrs(unsigned index, DictionaryAttr attributes) { - function_interface_impl::setResultAttrs($_op, index, attributes); + void setResultAttrs(unsigned index, ::mlir::DictionaryAttr attributes) { + ::mlir::function_interface_impl::setResultAttrs($_op, index, attributes); } - void setAllResultAttrs(ArrayRef attributes) { + void setAllResultAttrs(::llvm::ArrayRef<::mlir::DictionaryAttr> attributes) { assert(attributes.size() == $_op.getNumResults()); - function_interface_impl::setAllResultAttrDicts( + ::mlir::function_interface_impl::setAllResultAttrDicts( $_op, attributes); } - void setAllResultAttrs(ArrayRef attributes) { + void setAllResultAttrs(::llvm::ArrayRef<::mlir::Attribute> attributes) { assert(attributes.size() == $_op.getNumResults()); - function_interface_impl::setAllResultAttrDicts( + ::mlir::function_interface_impl::setAllResultAttrDicts( $_op, attributes); } - void setAllResultAttrs(ArrayAttr attributes) { + void setAllResultAttrs(::mlir::ArrayAttr attributes) { assert(attributes.size() == 
$_op.getNumResults()); $_op.setResAttrsAttr(attributes); } /// If the an attribute exists with the specified name, change it to the new /// value. Otherwise, add a new attribute with the specified name/value. - void setResultAttr(unsigned index, StringAttr name, Attribute value) { - function_interface_impl::setResultAttr($_op, index, name, value); + void setResultAttr(unsigned index, ::mlir::StringAttr name, ::mlir::Attribute value) { + ::mlir::function_interface_impl::setResultAttr($_op, index, name, value); } - void setResultAttr(unsigned index, StringRef name, Attribute value) { + void setResultAttr(unsigned index, ::llvm::StringRef name, ::mlir::Attribute value) { setResultAttr(index, - StringAttr::get(this->getOperation()->getContext(), name), + ::mlir::StringAttr::get(this->getOperation()->getContext(), name), value); } /// Remove the attribute 'name' from the result at 'index'. Return the /// attribute that was erased, or nullptr if there was no attribute with /// such name. - Attribute removeResultAttr(unsigned index, StringAttr name) { - return function_interface_impl::removeResultAttr($_op, index, name); + ::mlir::Attribute removeResultAttr(unsigned index, ::mlir::StringAttr name) { + return ::mlir::function_interface_impl::removeResultAttr($_op, index, name); } /// Returns the dictionary attribute corresponding to the argument at /// 'index'. If there are no argument attributes at 'index', a null /// attribute is returned. - DictionaryAttr getArgAttrDict(unsigned index) { + ::mlir::DictionaryAttr getArgAttrDict(unsigned index) { assert(index < $_op.getNumArguments() && "invalid argument number"); - return function_interface_impl::getArgAttrDict($_op, index); + return ::mlir::function_interface_impl::getArgAttrDict($_op, index); } /// Returns the dictionary attribute corresponding to the result at 'index'. /// If there are no result attributes at 'index', a null attribute is /// returned. 
- DictionaryAttr getResultAttrDict(unsigned index) { + ::mlir::DictionaryAttr getResultAttrDict(unsigned index) { assert(index < $_op.getNumResults() && "invalid result number"); - return function_interface_impl::getResultAttrDict($_op, index); + return ::mlir::function_interface_impl::getResultAttrDict($_op, index); } }]; From afd469023aad10786eaea3d444047a558ad8d5c1 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 22 Feb 2024 21:48:49 -0800 Subject: [PATCH 308/351] [lldb] Fix term-width setting (#82736) I noticed that the term-width setting would always report its default value (80) despite the driver correctly setting the value with SBDebugger::SetTerminalWidth. ``` (lldb) settings show term-width term-width (int) = 80 ``` The issue is that the setting was defined as a SInt64 instead of a UInt64 while the getter returned an unsigned value. There's no reason the terminal width should be a signed value. My best guess it that it was using SInt64 because UInt64 didn't support min and max values. 
I fixed that and correct the type and now lldb reports the correct terminal width: ``` (lldb) settings show term-width term-width (unsigned) = 189 ``` rdar://123488999 --- .../lldb/Interpreter/OptionValueSInt64.h | 4 +-- .../lldb/Interpreter/OptionValueUInt64.h | 26 +++++++++++++++++-- lldb/source/Core/CoreProperties.td | 2 +- lldb/source/Core/Debugger.cpp | 4 +-- lldb/source/Interpreter/OptionValueUInt64.cpp | 13 +++++++--- .../API/commands/settings/TestSettings.py | 15 ++++++++--- .../TestTrimmedProgressReporting.py | 3 ++- 7 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lldb/include/lldb/Interpreter/OptionValueSInt64.h b/lldb/include/lldb/Interpreter/OptionValueSInt64.h index 5efae627758ac..3cf41d38c0ef0 100644 --- a/lldb/include/lldb/Interpreter/OptionValueSInt64.h +++ b/lldb/include/lldb/Interpreter/OptionValueSInt64.h @@ -86,8 +86,8 @@ class OptionValueSInt64 : public Cloneable { protected: int64_t m_current_value = 0; int64_t m_default_value = 0; - int64_t m_min_value = INT64_MIN; - int64_t m_max_value = INT64_MAX; + int64_t m_min_value = std::numeric_limits::min(); + int64_t m_max_value = std::numeric_limits::max(); }; } // namespace lldb_private diff --git a/lldb/include/lldb/Interpreter/OptionValueUInt64.h b/lldb/include/lldb/Interpreter/OptionValueUInt64.h index 30c27bf73d99c..07076075790c6 100644 --- a/lldb/include/lldb/Interpreter/OptionValueUInt64.h +++ b/lldb/include/lldb/Interpreter/OptionValueUInt64.h @@ -64,13 +64,35 @@ class OptionValueUInt64 : public Cloneable { uint64_t GetDefaultValue() const { return m_default_value; } - void SetCurrentValue(uint64_t value) { m_current_value = value; } + bool SetCurrentValue(uint64_t value) { + if (value >= m_min_value && value <= m_max_value) { + m_current_value = value; + return true; + } + return false; + } + + bool SetDefaultValue(uint64_t value) { + if (value >= m_min_value && value <= m_max_value) { + m_default_value = value; + return true; + } + return false; + } + + void 
SetMinimumValue(int64_t v) { m_min_value = v; } + + uint64_t GetMinimumValue() const { return m_min_value; } + + void SetMaximumValue(int64_t v) { m_max_value = v; } - void SetDefaultValue(uint64_t value) { m_default_value = value; } + uint64_t GetMaximumValue() const { return m_max_value; } protected: uint64_t m_current_value = 0; uint64_t m_default_value = 0; + uint64_t m_min_value = std::numeric_limits::min(); + uint64_t m_max_value = std::numeric_limits::max(); }; } // namespace lldb_private diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 4cfff805688c5..a6cb951187a04 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -132,7 +132,7 @@ let Definition = "debugger" in { Global, DefaultStringValue<"${ansi.normal}">, Desc<"When displaying the line marker in a color-enabled terminal, use the ANSI terminal code specified in this format immediately after the line to be marked.">; - def TerminalWidth: Property<"term-width", "SInt64">, + def TerminalWidth: Property<"term-width", "UInt64">, Global, DefaultUnsignedValue<80>, Desc<"The maximum number of columns to use for displaying text.">; diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 97311b4716ac2..bb81110ae35a5 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -886,8 +886,8 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) } assert(m_dummy_target_sp.get() && "Couldn't construct dummy target?"); - OptionValueSInt64 *term_width = - m_collection_sp->GetPropertyAtIndexAsOptionValueSInt64( + OptionValueUInt64 *term_width = + m_collection_sp->GetPropertyAtIndexAsOptionValueUInt64( ePropertyTerminalWidth); term_width->SetMinimumValue(10); term_width->SetMaximumValue(1024); diff --git a/lldb/source/Interpreter/OptionValueUInt64.cpp b/lldb/source/Interpreter/OptionValueUInt64.cpp index 1999c63d11aff..2e69c164e32ac 100644 --- 
a/lldb/source/Interpreter/OptionValueUInt64.cpp +++ b/lldb/source/Interpreter/OptionValueUInt64.cpp @@ -47,9 +47,16 @@ Status OptionValueUInt64::SetValueFromString(llvm::StringRef value_ref, llvm::StringRef value_trimmed = value_ref.trim(); uint64_t value; if (llvm::to_integer(value_trimmed, value)) { - m_value_was_set = true; - m_current_value = value; - NotifyValueChanged(); + if (value >= m_min_value && value <= m_max_value) { + m_value_was_set = true; + m_current_value = value; + NotifyValueChanged(); + } else { + error.SetErrorStringWithFormat( + "%" PRIu64 " is out of range, valid values must be between %" PRIu64 + " and %" PRIu64 ".", + value, m_min_value, m_max_value); + } } else { error.SetErrorStringWithFormat("invalid uint64_t string value: '%s'", value_ref.str().c_str()); diff --git a/lldb/test/API/commands/settings/TestSettings.py b/lldb/test/API/commands/settings/TestSettings.py index a2d845493d1df..104a9f09788c3 100644 --- a/lldb/test/API/commands/settings/TestSettings.py +++ b/lldb/test/API/commands/settings/TestSettings.py @@ -2,7 +2,6 @@ Test lldb settings command. """ - import json import os import re @@ -151,14 +150,22 @@ def test_set_term_width(self): self.expect( "settings show term-width", SETTING_MSG("term-width"), - startstr="term-width (int) = 70", + startstr="term-width (unsigned) = 70", ) # The overall display should also reflect the new setting. 
self.expect( "settings show", SETTING_MSG("term-width"), - substrs=["term-width (int) = 70"], + substrs=["term-width (unsigned) = 70"], + ) + + self.dbg.SetTerminalWidth(60) + + self.expect( + "settings show", + SETTING_MSG("term-width"), + substrs=["term-width (unsigned) = 60"], ) # rdar://problem/10712130 @@ -593,7 +600,7 @@ def test_settings_with_trailing_whitespace(self): self.expect( "settings show term-width", SETTING_MSG("term-width"), - startstr="term-width (int) = 60", + startstr="term-width (unsigned) = 60", ) self.runCmd("settings clear term-width", check=False) # string diff --git a/lldb/test/API/functionalities/progress_reporting/TestTrimmedProgressReporting.py b/lldb/test/API/functionalities/progress_reporting/TestTrimmedProgressReporting.py index 357999b6f5619..ee35dbd23b3db 100644 --- a/lldb/test/API/functionalities/progress_reporting/TestTrimmedProgressReporting.py +++ b/lldb/test/API/functionalities/progress_reporting/TestTrimmedProgressReporting.py @@ -24,7 +24,8 @@ def do_test(self, term_width, pattern_list): ) self.expect("set set term-width " + str(term_width)) self.expect( - "set show term-width", substrs=["term-width (int) = " + str(term_width)] + "set show term-width", + substrs=["term-width (unsigned) = " + str(term_width)], ) self.child.send("file " + self.getBuildArtifact("a.out") + "\n") From 850dde063b7f70bb592723064385e9f9ad39c96e Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Fri, 23 Feb 2024 14:17:15 +0800 Subject: [PATCH 309/351] [RISCV][VP] Introduce vp saturating addition/subtraction and RISC-V support. (#82370) This patch also pick the MatchContext framework from DAGCombiner to an indiviual header file to make the framework be used from other files in llvm/lib/CodeGen/SelectionDAG/. 
--- llvm/docs/LangRef.rst | 203 ++ llvm/include/llvm/IR/Intrinsics.td | 20 + llvm/include/llvm/IR/VPIntrinsics.def | 24 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 137 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 37 +- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 + .../SelectionDAG/LegalizeVectorTypes.cpp | 16 +- llvm/lib/CodeGen/SelectionDAG/MatchContext.h | 175 ++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 +- .../RISCV/rvv/fixed-vectors-vsadd-vp.ll | 1701 ++++++++++++++ .../RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 1697 ++++++++++++++ .../RISCV/rvv/fixed-vectors-vssub-vp.ll | 1745 ++++++++++++++ .../RISCV/rvv/fixed-vectors-vssubu-vp.ll | 1740 ++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll | 2015 ++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll | 2014 ++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll | 2067 +++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll | 2065 ++++++++++++++++ llvm/unittests/IR/VPIntrinsicTest.cpp | 8 + 18 files changed, 15521 insertions(+), 157 deletions(-) create mode 100644 llvm/lib/CodeGen/SelectionDAG/MatchContext.h create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 8f4495e25d0fa..19ca9f6ae3fe3 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16749,6 +16749,7 @@ an operation is greater than the maximum value, the result is set (or "clamped") to this maximum. If it is below the minimum, it is clamped to this minimum. +.. 
_int_sadd_sat: '``llvm.sadd.sat.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16798,6 +16799,8 @@ Examples %res = call i4 @llvm.sadd.sat.i4(i4 -4, i4 -5) ; %res = -8 +.. _int_uadd_sat: + '``llvm.uadd.sat.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16845,6 +16848,8 @@ Examples %res = call i4 @llvm.uadd.sat.i4(i4 8, i4 8) ; %res = 15 +.. _int_ssub_sat: + '``llvm.ssub.sat.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16893,6 +16898,8 @@ Examples %res = call i4 @llvm.ssub.sat.i4(i4 4, i4 -5) ; %res = 7 +.. _int_usub_sat: + '``llvm.usub.sat.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -23610,6 +23617,202 @@ Examples: %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison +.. _int_vp_sadd_sat: + +'``llvm.vp.sadd.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.sadd.sat.v16i32 (<16 x i32> <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.sadd.sat.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.sadd.sat.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated signed saturating addition of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.sadd.sat``' intrinsic performs sadd.sat (:ref:`sadd.sat `) +of the first and second vector operands on each enabled lane. The result on +disabled lanes is a :ref:`poison value `. + + +Examples: +""""""""" + +.. 
code-block:: llvm + + %r = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + +.. _int_vp_uadd_sat: + +'``llvm.vp.uadd.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.uadd.sat.v16i32 (<16 x i32> <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.uadd.sat.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.uadd.sat.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated unsigned saturating addition of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.uadd.sat``' intrinsic performs uadd.sat (:ref:`uadd.sat `) +of the first and second vector operands on each enabled lane. The result on +disabled lanes is a :ref:`poison value `. + + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + +.. _int_vp_ssub_sat: + +'``llvm.vp.ssub.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <16 x i32> @llvm.vp.ssub.sat.v16i32 (<16 x i32> <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.ssub.sat.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.ssub.sat.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated signed saturating subtraction of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.ssub.sat``' intrinsic performs ssub.sat (:ref:`ssub.sat `) +of the first and second vector operands on each enabled lane. The result on +disabled lanes is a :ref:`poison value `. + + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + +.. _int_vp_usub_sat: + +'``llvm.vp.usub.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.usub.sat.v16i32 (<16 x i32> <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.usub.sat.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.usub.sat.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated unsigned saturating subtraction of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. 
+ +Semantics: +"""""""""" + +The '``llvm.vp.usub.sat``' intrinsic performs usub.sat (:ref:`usub.sat `) +of the first and second vector operands on each enabled lane. The result on +disabled lanes is a :ref:`poison value `. + + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + .. _int_vp_fshl: '``llvm.vp.fshl.*``' Intrinsics diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 8c0d4d5db32d8..d7c1ce153a6c8 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1933,6 +1933,26 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; + def int_vp_sadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_uadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_ssub_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_usub_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; // Floating-point arithmetic def int_vp_fadd : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 3b32b60609f53..4089acf9ec3f0 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ 
b/llvm/include/llvm/IR/VPIntrinsics.def @@ -293,6 +293,30 @@ BEGIN_REGISTER_VP(vp_fshr, 3, 4, VP_FSHR, -1) VP_PROPERTY_FUNCTIONAL_INTRINSIC(fshr) VP_PROPERTY_FUNCTIONAL_SDOPC(FSHR) END_REGISTER_VP(vp_fshr, VP_FSHR) + +// llvm.vp.sadd.sat(x,y,mask,vlen) +BEGIN_REGISTER_VP(vp_sadd_sat, 2, 3, VP_SADDSAT, -1) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(sadd_sat) +VP_PROPERTY_FUNCTIONAL_SDOPC(SADDSAT) +END_REGISTER_VP(vp_sadd_sat, VP_SADDSAT) + +// llvm.vp.uadd.sat(x,y,mask,vlen) +BEGIN_REGISTER_VP(vp_uadd_sat, 2, 3, VP_UADDSAT, -1) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(uadd_sat) +VP_PROPERTY_FUNCTIONAL_SDOPC(UADDSAT) +END_REGISTER_VP(vp_uadd_sat, VP_UADDSAT) + +// llvm.vp.ssub.sat(x,y,mask,vlen) +BEGIN_REGISTER_VP(vp_ssub_sat, 2, 3, VP_SSUBSAT, -1) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(ssub_sat) +VP_PROPERTY_FUNCTIONAL_SDOPC(SSUBSAT) +END_REGISTER_VP(vp_ssub_sat, VP_SSUBSAT) + +// llvm.vp.usub.sat(x,y,mask,vlen) +BEGIN_REGISTER_VP(vp_usub_sat, 2, 3, VP_USUBSAT, -1) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(usub_sat) +VP_PROPERTY_FUNCTIONAL_SDOPC(USUBSAT) +END_REGISTER_VP(vp_usub_sat, VP_USUBSAT) ///// } Integer Arithmetic ///// Floating-Point Arithmetic { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ed43dd7f52882..6a28bc8da223b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -76,6 +76,8 @@ #include #include +#include "MatchContext.h" + using namespace llvm; #define DEBUG_TYPE "dagcombine" @@ -888,141 +890,6 @@ class WorklistInserter : public SelectionDAG::DAGUpdateListener { void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); } }; -class EmptyMatchContext { - SelectionDAG &DAG; - const TargetLowering &TLI; - -public: - EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) - : DAG(DAG), TLI(TLI) {} - - bool match(SDValue OpN, unsigned Opcode) const { - return Opcode == OpN->getOpcode(); - } - - // Same as 
SelectionDAG::getNode(). - template SDValue getNode(ArgT &&...Args) { - return DAG.getNode(std::forward(Args)...); - } - - bool isOperationLegalOrCustom(unsigned Op, EVT VT, - bool LegalOnly = false) const { - return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly); - } -}; - -class VPMatchContext { - SelectionDAG &DAG; - const TargetLowering &TLI; - SDValue RootMaskOp; - SDValue RootVectorLenOp; - -public: - VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) - : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() { - assert(Root->isVPOpcode()); - if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode())) - RootMaskOp = Root->getOperand(*RootMaskPos); - else if (Root->getOpcode() == ISD::VP_SELECT) - RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root), - Root->getOperand(0).getValueType()); - - if (auto RootVLenPos = - ISD::getVPExplicitVectorLengthIdx(Root->getOpcode())) - RootVectorLenOp = Root->getOperand(*RootVLenPos); - } - - /// whether \p OpVal is a node that is functionally compatible with the - /// NodeType \p Opc - bool match(SDValue OpVal, unsigned Opc) const { - if (!OpVal->isVPOpcode()) - return OpVal->getOpcode() == Opc; - - auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), - !OpVal->getFlags().hasNoFPExcept()); - if (BaseOpc != Opc) - return false; - - // Make sure the mask of OpVal is true mask or is same as Root's. - unsigned VPOpcode = OpVal->getOpcode(); - if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) { - SDValue MaskOp = OpVal.getOperand(*MaskPos); - if (RootMaskOp != MaskOp && - !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode())) - return false; - } - - // Make sure the EVL of OpVal is same as Root's. - if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode)) - if (RootVectorLenOp != OpVal.getOperand(*VLenPos)) - return false; - return true; - } - - // Specialize based on number of operands. 
- // TODO emit VP intrinsics where MaskOp/VectorLenOp != null - // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return - // DAG.getNode(Opcode, DL, VT); } - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 1 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); - return DAG.getNode(VPOpcode, DL, VT, - {Operand, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 2 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDValue N3) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 3 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, N3, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, - SDNodeFlags Flags) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 1 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); - return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp}, - Flags); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDNodeFlags Flags) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 2 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); - return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}, - Flags); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDValue N3, SDNodeFlags Flags) { - unsigned 
VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 3 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags); - } - - bool isOperationLegalOrCustom(unsigned Op, EVT VT, - bool LegalOnly = false) const { - unsigned VPOp = ISD::getVPForBaseOpcode(Op); - return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly); - } -}; - } // end anonymous namespace //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a4ba261686c68..df17d6530b0de 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -217,7 +217,15 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SSUBSAT: case ISD::USUBSAT: case ISD::SSHLSAT: - case ISD::USHLSAT: Res = PromoteIntRes_ADDSUBSHLSAT(N); break; + case ISD::USHLSAT: + Res = PromoteIntRes_ADDSUBSHLSAT(N); + break; + case ISD::VP_SADDSAT: + case ISD::VP_UADDSAT: + case ISD::VP_SSUBSAT: + case ISD::VP_USUBSAT: + Res = PromoteIntRes_ADDSUBSHLSAT(N); + break; case ISD::SMULFIX: case ISD::SMULFIXSAT: @@ -934,6 +942,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { return DAG.getBoolExtOrTrunc(Res.getValue(1), dl, NVT, VT); } +template SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { // If the promoted type is legal, we can convert this to: // 1. 
ANY_EXTEND iN to iM @@ -945,11 +954,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { SDLoc dl(N); SDValue Op1 = N->getOperand(0); SDValue Op2 = N->getOperand(1); + MatchContextClass matcher(DAG, TLI, N); unsigned OldBits = Op1.getScalarValueSizeInBits(); - unsigned Opcode = N->getOpcode(); + unsigned Opcode = matcher.getRootBaseOpcode(); bool IsShift = Opcode == ISD::USHLSAT || Opcode == ISD::SSHLSAT; + // FIXME: We need vp-aware PromotedInteger functions. SDValue Op1Promoted, Op2Promoted; if (IsShift) { Op1Promoted = GetPromotedInteger(Op1); @@ -968,18 +979,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { APInt MaxVal = APInt::getAllOnes(OldBits).zext(NewBits); SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); SDValue Add = - DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); + matcher.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); + return matcher.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); } // USUBSAT can always be promoted as long as we have zero-extended the args. if (Opcode == ISD::USUBSAT) - return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted, - Op2Promoted); + return matcher.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted, + Op2Promoted); // Shift cannot use a min/max expansion, we can't detect overflow if all of // the bits have been shifted out. 
- if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) { + if (IsShift || matcher.isOperationLegal(Opcode, PromotedType)) { unsigned ShiftOp; switch (Opcode) { case ISD::SADDSAT: @@ -1002,11 +1013,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount); if (!IsShift) Op2Promoted = - DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount); + matcher.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount); SDValue Result = - DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); + matcher.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); + return matcher.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); } unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB; @@ -1015,9 +1026,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType); SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); SDValue Result = - DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); - Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); - Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); + matcher.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); + Result = matcher.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); + Result = matcher.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); return Result; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9114987162857..3c84f67653eca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H #define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H +#include "MatchContext.h" #include "llvm/ADT/DenseMap.h" #include 
"llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" @@ -355,6 +356,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_VSCALE(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); + template SDValue PromoteIntRes_ADDSUBSHLSAT(SDNode *N); SDValue PromoteIntRes_MULFIX(SDNode *N); SDValue PromoteIntRes_DIVFIX(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7fc252600534f..90cda2a1155b6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1163,10 +1163,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMAX: case ISD::VP_SMAX: case ISD::UMIN: case ISD::VP_UMIN: case ISD::UMAX: case ISD::VP_UMAX: - case ISD::SADDSAT: - case ISD::UADDSAT: - case ISD::SSUBSAT: - case ISD::USUBSAT: + case ISD::SADDSAT: case ISD::VP_SADDSAT: + case ISD::UADDSAT: case ISD::VP_UADDSAT: + case ISD::SSUBSAT: case ISD::VP_SSUBSAT: + case ISD::USUBSAT: case ISD::VP_USUBSAT: case ISD::SSHLSAT: case ISD::USHLSAT: case ISD::ROTL: @@ -4140,10 +4140,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMAX: case ISD::VP_SMAX: case ISD::UMIN: case ISD::VP_UMIN: case ISD::UMAX: case ISD::VP_UMAX: - case ISD::UADDSAT: - case ISD::SADDSAT: - case ISD::USUBSAT: - case ISD::SSUBSAT: + case ISD::UADDSAT: case ISD::VP_UADDSAT: + case ISD::SADDSAT: case ISD::VP_SADDSAT: + case ISD::USUBSAT: case ISD::VP_USUBSAT: + case ISD::SSUBSAT: case ISD::VP_SSUBSAT: case ISD::SSHLSAT: case ISD::USHLSAT: case ISD::ROTL: diff --git a/llvm/lib/CodeGen/SelectionDAG/MatchContext.h b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h new file mode 100644 index 0000000000000..f965cb952f97a --- /dev/null +++ b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h @@ -0,0 +1,175 @@ +//===---------------- 
llvm/CodeGen/MatchContext.h --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the EmptyMatchContext class and VPMatchContext class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H +#define LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H + +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" + +using namespace llvm; + +namespace { +class EmptyMatchContext { + SelectionDAG &DAG; + const TargetLowering &TLI; + SDNode *Root; + +public: + EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) + : DAG(DAG), TLI(TLI), Root(Root) {} + + unsigned getRootBaseOpcode() { return Root->getOpcode(); } + bool match(SDValue OpN, unsigned Opcode) const { + return Opcode == OpN->getOpcode(); + } + + // Same as SelectionDAG::getNode(). 
+ template SDValue getNode(ArgT &&...Args) { + return DAG.getNode(std::forward(Args)...); + } + + bool isOperationLegal(unsigned Op, EVT VT) const { + return TLI.isOperationLegal(Op, VT); + } + + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly); + } +}; + +class VPMatchContext { + SelectionDAG &DAG; + const TargetLowering &TLI; + SDValue RootMaskOp; + SDValue RootVectorLenOp; + SDNode *Root; + +public: + VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *_Root) + : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() { + Root = _Root; + assert(Root->isVPOpcode()); + if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode())) + RootMaskOp = Root->getOperand(*RootMaskPos); + else if (Root->getOpcode() == ISD::VP_SELECT) + RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root), + Root->getOperand(0).getValueType()); + + if (auto RootVLenPos = ISD::getVPExplicitVectorLengthIdx(Root->getOpcode())) + RootVectorLenOp = Root->getOperand(*RootVLenPos); + } + + unsigned getRootBaseOpcode() { + std::optional Opcode = ISD::getBaseOpcodeForVP( + Root->getOpcode(), !Root->getFlags().hasNoFPExcept()); + assert(Opcode.has_value()); + return *Opcode; + } + + /// whether \p OpVal is a node that is functionally compatible with the + /// NodeType \p Opc + bool match(SDValue OpVal, unsigned Opc) const { + if (!OpVal->isVPOpcode()) + return OpVal->getOpcode() == Opc; + + auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), + !OpVal->getFlags().hasNoFPExcept()); + if (BaseOpc != Opc) + return false; + + // Make sure the mask of OpVal is true mask or is same as Root's. + unsigned VPOpcode = OpVal->getOpcode(); + if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) { + SDValue MaskOp = OpVal.getOperand(*MaskPos); + if (RootMaskOp != MaskOp && + !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode())) + return false; + } + + // Make sure the EVL of OpVal is same as Root's. 
+ if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode)) + if (RootVectorLenOp != OpVal.getOperand(*VLenPos)) + return false; + return true; + } + + // Specialize based on number of operands. + // TODO emit VP intrinsics where MaskOp/VectorLenOp != null + // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return + // DAG.getNode(Opcode, DL, VT); } + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 1 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); + return DAG.getNode(VPOpcode, DL, VT, + {Operand, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 2 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); + return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 3 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, N3, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, + SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 1 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); + return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp}, + Flags); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 2 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); + return DAG.getNode(VPOpcode, 
DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}, + Flags); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3, SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 3 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags); + } + + bool isOperationLegal(unsigned Op, EVT VT) const { + unsigned VPOp = ISD::getVPForBaseOpcode(Op); + return TLI.isOperationLegal(VPOp, VT); + } + + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + unsigned VPOp = ISD::getVPForBaseOpcode(Op); + return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly); + } +}; +} // end anonymous namespace +#endif diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7540b22d13b7f..540c2e7476dc1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -691,7 +691,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND, ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN, ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX, - ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE}; + ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, + ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT, + ISD::VP_USUBSAT}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, @@ -5752,6 +5754,10 @@ static unsigned getRISCVVLOp(SDValue Op) { VP_CASE(SINT_TO_FP) // VP_SINT_TO_FP VP_CASE(UINT_TO_FP) // VP_UINT_TO_FP VP_CASE(BITREVERSE) // VP_BITREVERSE + VP_CASE(SADDSAT) // VP_SADDSAT + VP_CASE(UADDSAT) // VP_UADDSAT + VP_CASE(SSUBSAT) // VP_SSUBSAT + VP_CASE(USUBSAT) // VP_USUBSAT VP_CASE(BSWAP) // VP_BSWAP VP_CASE(CTLZ) // VP_CTLZ VP_CASE(CTTZ) // VP_CTTZ 
@@ -6791,6 +6797,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_UDIV: case ISD::VP_SREM: case ISD::VP_UREM: + case ISD::VP_UADDSAT: + case ISD::VP_USUBSAT: + case ISD::VP_SADDSAT: + case ISD::VP_SSUBSAT: return lowerVPOp(Op, DAG); case ISD::VP_AND: case ISD::VP_OR: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll new file mode 100644 index 0000000000000..6c5dd0403dff1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -0,0 +1,1701 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <8 x i7> @llvm.vp.sadd.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) + +define <8 x i7> @vsadd_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vv v9, v9, v9 +; CHECK-NEXT: vsra.vi v9, v9, 1 +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 192 +; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %v = call <8 x i7> @llvm.vp.sadd.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) + ret <8 x i7> %v +} + +declare <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) + +define <2 x i8> @vsadd_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i8> 
@llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = 
insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +declare <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) + +define <4 x i8> @vsadd_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, 
e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + 
+define <4 x i8> @vsadd_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +declare <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32) + +define <5 x i8> @vsadd_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> 
@llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +declare <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) + +define <8 x i8> @vsadd_vv_v8i8(<8 x i8> %va, <8 x 
i8> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsadd_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsadd_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsadd_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> 
@vsadd_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsadd_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +declare <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) + +define <16 x i8> @vsadd_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> 
@llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: 
vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +declare <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32) + +define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v258i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: addi a0, a1, -128 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +define <256 x i8> @vsadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v258i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -128 +; CHECK-NEXT: 
sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %head = insertelement <256 x i1> poison, i1 true, i32 0 + %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +; Test splitting when the %evl is a known constant. + +define <256 x i8> @vsadd_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vsadd_vi_v258i8_evl129: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129) + ret <256 x i8> %v +} + +; FIXME: The upper half is doing nothing. 
+ +define <256 x i8> @vsadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vsadd_vi_v258i8_evl128: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128) + ret <256 x i8> %v +} + +declare <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) + +define <2 x i16> @vsadd_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = 
shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + 
ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vsadd_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x 
i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vsadd_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsadd_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma 
+; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsadd_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsadd_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsadd_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + 
+define <8 x i16> @vsadd_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vsadd_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x 
i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x 
i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vsadd_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 
true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsadd_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i32_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, 
<4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) + +define <8 x i32> @vsadd_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = 
shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret 
<8 x i32> %v +} + +declare <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vsadd_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> 
poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vsadd_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: 
vsadd_vv_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, 
a0 +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vsadd_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, 
<4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 
+; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32) + +define <8 x i64> @vsadd_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli 
zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, 
m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) + 
+define <16 x i64> @vsadd_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: 
vsadd_vx_v16i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector 
<16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +; Test that split-legalization works as expected. + +declare <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) + +define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB108_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB108_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB108_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB108_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> 
@llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vi_v32i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB109_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB109_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v24 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v16, v16, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vi_v32i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB109_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB109_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v8, v8, -1 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v16, v16, -1 +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +; FIXME: We don't match vsadd.vi on RV32. 
+ +define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vsadd_vx_v32i64_evl12: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v32i64_evl12: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12) + ret <32 x i64> %v +} + +define <32 x i64> @vsadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vsadd_vx_v32i64_evl27: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v32i64_evl27: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t +; RV64-NEXT: 
vsetivli zero, 11, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27) + ret <32 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll new file mode 100644 index 0000000000000..6227f8abe599e --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -0,0 +1,1697 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <8 x i7> @llvm.vp.uadd.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) + +define <8 x i7> @vsaddu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 127 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: vminu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call <8 x i7> @llvm.vp.uadd.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) + ret <8 x i7> %v +} + +declare <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) + +define <2 x i8> @vsaddu_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i8> 
@llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + 
%elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +declare <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) + +define <4 x i8> @vsaddu_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i8: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x 
i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +declare <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32) + +define <5 x i8> @vsaddu_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, 
<5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +declare <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, 
i32) + +define <8 x i8> @vsaddu_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> 
%m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +declare <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) + +define <16 x i8> @vsaddu_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x 
i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i8_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +declare <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32) + +define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v258i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: addi a0, a1, -128 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +define <256 x i8> @vsaddu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v258i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; 
CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -128 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %head = insertelement <256 x i1> poison, i1 true, i32 0 + %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +; Test splitting when the %evl is a known constant. + +define <256 x i8> @vsaddu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vsaddu_vi_v258i8_evl129: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129) + ret <256 x i8> %v +} + +; FIXME: The upper half is doing nothing. 
+ +define <256 x i8> @vsaddu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vsaddu_vi_v258i8_evl128: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128) + ret <256 x i8> %v +} + +declare <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) + +define <2 x i16> @vsaddu_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, 
i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x 
i1> %m, i32 %evl) + ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vsaddu_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, 
i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vsaddu_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i16_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x 
i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vsaddu_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = 
insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> 
zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vsaddu_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> 
%elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsaddu_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vv_v4i32_unmasked(<4 x i32> 
%va, <4 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x 
i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) + +define <8 x i32> @vsaddu_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: 
vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> 
zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +declare <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vsaddu_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 
%b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vsaddu_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 
%evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; 
RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vsaddu_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: 
vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64>, <8 x 
i64>, <8 x i1>, i32) + +define <8 x i64> @vsaddu_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v8i64_unmasked: 
+; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer 
+ %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) + +define <16 x i64> @vsaddu_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> 
@llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v16i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + 
%elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +; Test that split-legalization works as expected. + +declare <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) + +define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB108_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB108_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB108_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB108_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsaddu.vi 
v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vi_v32i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB109_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB109_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v24 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v16, v16, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vi_v32i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB109_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB109_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v8, v8, -1 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v16, v16, -1 +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +; FIXME: We don't match 
vsaddu.vi on RV32. + +define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vsaddu_vx_v32i64_evl12: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v32i64_evl12: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12) + ret <32 x i64> %v +} + +define <32 x i64> @vsaddu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vsaddu_vx_v32i64_evl27: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v32i64_evl27: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v8, 
v8, -1, v0.t +; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27) + ret <32 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll new file mode 100644 index 0000000000000..6360cf49d8d47 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -0,0 +1,1745 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <8 x i7> @llvm.vp.ssub.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) + +define <8 x i7> @vssub_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vv v9, v9, v9 +; CHECK-NEXT: vsra.vi v9, v9, 1 +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 192 +; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %v = call <8 x i7> @llvm.vp.ssub.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) + ret <8 x i7> %v +} + +declare <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) + +define <2 x i8> @vssub_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i8: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i8: +; CHECK: # 
%bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +declare <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) + +define <4 x i8> @vssub_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x 
i8> %v +} + +define <4 x i8> @vssub_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v9, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, 
v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +declare <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32) + +define <5 x i8> @vssub_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vssub_vx_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement 
<5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +declare <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) + +define <8 x i8> @vssub_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = 
shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +declare <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) + +define <16 x i8> @vssub_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> 
@vssub_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssub_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssub_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssub_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, 
i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssub_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +declare <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32) + +define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v258i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: addi a0, a1, -128 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 
%evl) + ret <256 x i8> %v +} + +define <256 x i8> @vssub_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v258i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2 +; CHECK-NEXT: addi a1, a0, -128 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2 +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %head = insertelement <256 x i1> poison, i1 true, i32 0 + %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +; Test splitting when the %evl is a known constant. + +define <256 x i8> @vssub_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vssub_vi_v258i8_evl129: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129) + ret <256 x i8> %v +} + +; FIXME: The upper half is doing nothing. 
+ +define <256 x i8> @vssub_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vssub_vi_v258i8_evl128: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128) + ret <256 x i8> %v +} + +declare <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) + +define <2 x i16> @vssub_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> 
poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> 
@llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vssub_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + 
%head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vssub_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, 
i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> 
poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vssub_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x 
i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vssub_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: 
vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vssub_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v 
= call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, 
e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) + +define <8 x i32> @vssub_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssub_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + 
+define <8 x i32> @vssub_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssub_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssub_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssub_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = 
insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +declare <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vssub_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vx_v16i32_unmasked(<16 x i32> %va, i32 
%b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vssub_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x 
i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 
12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x 
i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vssub_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssub_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssub_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> 
@vssub_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssub_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssub_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> 
zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32) + +define <8 x i64> @vssub_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = 
shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: 
vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) + +define <16 x i64> @vssub_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, 
sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v16i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call 
<16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +; Test that split-legalization works as expected. + +declare <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) + +define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB108_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB108_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssub.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB108_2 
+; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB108_2: +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a2, v0.t +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssub.vx v16, v16, a2, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vssub_vi_v32i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB109_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB109_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v24 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v16, v16, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vi_v32i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB109_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB109_2: +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a2 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v16, v16, a2 +; RV64-NEXT: ret + %elt.head = 
insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +; FIXME: We don't match vssub.vi on RV32. + +define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vssub_vx_v32i64_evl12: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssub.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v32i64_evl12: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssub.vx v16, v16, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12) + ret <32 x i64> %v +} + +define <32 x i64> @vssub_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vssub_vx_v32i64_evl27: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; 
RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssub.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v32i64_evl27: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssub.vx v16, v16, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27) + ret <32 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll new file mode 100644 index 0000000000000..6ea9758871230 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -0,0 +1,1740 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <8 x i7> @llvm.vp.usub.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) + +define <8 x i7> @vssubu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 127 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i7> 
@llvm.vp.usub.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) + ret <8 x i7> %v +} + +declare <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) + +define <2 x i8> @vssubu_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, 
i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +declare <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) + +define <4 x i8> @vssubu_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i8_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v9, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = 
call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +declare <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32) + +define <5 x i8> @vssubu_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: 
ret + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vi_v5i8_unmasked(<5 x i8> %va, i32 
zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +declare <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) + +define <8 x i8> @vssubu_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x 
i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +declare <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) + +define <16 x 
i8> @vssubu_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x 
i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +declare <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32) + +define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v258i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: addi a0, a1, -128 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: 
.LBB32_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +define <256 x i8> @vssubu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v258i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2 +; CHECK-NEXT: addi a1, a0, -128 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a2 +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %head = insertelement <256 x i1> poison, i1 true, i32 0 + %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +; Test splitting when the %evl is a known constant. 
+ +define <256 x i8> @vssubu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vssubu_vi_v258i8_evl129: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129) + ret <256 x i8> %v +} + +; FIXME: The upper half is doing nothing. + +define <256 x i8> @vssubu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vssubu_vi_v258i8_evl128: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128) + ret <256 x i8> %v +} + +declare <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) + +define <2 x i16> @vssubu_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> 
@vssubu_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssubu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssubu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssubu_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, 
i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssubu_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vssubu_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vssubu_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> 
poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vssubu_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; 
CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vssubu_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, 
v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vssubu_vi_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vssubu_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x 
i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i32_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vssubu_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, 
<4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) 
+ +define <8 x i32> @vssubu_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> 
@llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +declare <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vssubu_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, 
ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> 
@llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vssubu_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; 
RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: 
vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vssubu_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + 
ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x 
i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32) + +define <8 x i64> @vssubu_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + 
%head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> 
poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) + +define <16 x i64> @vssubu_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define 
<16 x i64> @vssubu_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssubu_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssubu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v16i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; 
RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssubu_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssubu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +; Test that split-legalization works as expected. 
+ +declare <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) + +define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB108_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB108_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB108_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB108_2: +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a2, v0.t +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssubu.vx v16, v16, a2, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vi_v32i64_unmasked: +; 
RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB109_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB109_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v24 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v16, v16, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vi_v32i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB109_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB109_2: +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a2 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v16, v16, a2 +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +; FIXME: We don't match vssubu.vi on RV32. 
+ +define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vssubu_vx_v32i64_evl12: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v32i64_evl12: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12) + ret <32 x i64> %v +} + +define <32 x i64> @vssubu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vssubu_vx_v32i64_evl27: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v32i64_evl27: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, 
ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27) + ret <32 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll new file mode 100644 index 0000000000000..caaeae55ed78e --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll @@ -0,0 +1,2015 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.sadd.sat.nxv8i7(, , , i32) + +define @vsadd_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 192 +; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i7 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i7( %a, %vb, %mask, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv1i8(, , , i32) + +define @vsadd_vv_nxv1i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call 
@llvm.vp.sadd.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv1i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i8_commute( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %vb, %va, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t 
+; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv2i8(, , , i32) + +define @vsadd_vv_nxv2i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv2i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv3i8(, , , i32) + +define @vsadd_vv_nxv3i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv3i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %b, %m, i32 
%evl) + ret %v +} + +define @vsadd_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv3i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv3i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv3i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv4i8(, , , i32) + +define @vsadd_vv_nxv4i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i8: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv4i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: 
vsadd_vi_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv8i8(, , , i32) + +define @vsadd_vv_nxv8i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv8i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, 
zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv16i8(, , , i32) + +define @vsadd_vv_nxv16i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv16i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement 
poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv32i8(, , , i32) + +define @vsadd_vv_nxv32i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv32i8_unmasked( %va, %b, i32 zeroext %evl) { +; 
CHECK-LABEL: vsadd_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv32i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv32i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = 
insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv64i8(, , , i32) + +define @vsadd_vv_nxv64i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv64i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv64i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv64i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; 
CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works when the mask itself needs splitting. + +declare @llvm.vp.sadd.sat.nxv128i8(, , , i32) + +define @vsadd_vi_nxv128i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub a2, a1, a0 +; CHECK-NEXT: sltu a3, a1, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a1, a0, .LBB50_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv128i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv128i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; 
CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1 +; CHECK-NEXT: bltu a0, a1, .LBB51_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB51_2: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv1i16(, , , i32) + +define @vsadd_vv_nxv1i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv1i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i16_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv2i16(, , , i32) + +define @vsadd_vv_nxv2i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv2i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.sadd.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv4i16(, , , i32) + +define @vsadd_vv_nxv4i16( %va, %b, %m, i32 
zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv4i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define 
@vsadd_vi_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv8i16(, , , i32) + +define @vsadd_vv_nxv8i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv8i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, 
zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv16i16(, , , i32) + +define @vsadd_vv_nxv16i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv16i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv32i16(, , , i32) + +define @vsadd_vv_nxv32i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = 
call @llvm.vp.sadd.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv32i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv32i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv32i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; 
CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv1i32(, , , i32) + +define @vsadd_vv_nxv1i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv1i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %vb, %m, i32 
%evl) + ret %v +} + +define @vsadd_vi_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv2i32(, , , i32) + +define @vsadd_vv_nxv2i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv2i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, 
poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv4i32(, , , i32) + +define @vsadd_vv_nxv4i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv4i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i32_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m 
= shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv8i32(, , , i32) + +define @vsadd_vv_nxv8i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv8i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, 
v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv16i32(, , , i32) + +define @vsadd_vv_nxv16i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv16i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: 
vsadd_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works then the mask needs manual splitting. 
+ +declare @llvm.vp.sadd.sat.nxv32i32(, , , i32) + +define @vsadd_vi_nxv32i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a0, a1, .LBB118_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1 +; CHECK-NEXT: bltu a0, a1, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv1i64(, , , i32) + +define @vsadd_vv_nxv1i64( %va, 
%b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv1i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: 
ret +; +; RV64-LABEL: vsadd_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv2i64(, , , i32) + +define @vsadd_vv_nxv2i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv2i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = 
shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i64: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv4i64(, , , i32) + +define @vsadd_vv_nxv4i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv4i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsadd.vv v8, 
v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = 
insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv8i64(, , , i32) + +define @vsadd_vv_nxv8i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv8i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: 
addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll new file mode 100644 index 0000000000000..c0779e508c0a9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll @@ -0,0 +1,2014 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.uadd.sat.nxv8i7(, , , i32) + +define @vsaddu_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v9, v9, a2 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: vminu.vx v8, v8, a2, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i7 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i7( %a, %vb, %mask, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv1i8(, , , i32) + +define @vsaddu_vv_nxv1i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv1i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector 
%elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i8_commute( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %vb, %va, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv2i8(, , , 
i32) + +define @vsaddu_vv_nxv2i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv2i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( 
%va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv3i8(, , , i32) + +define @vsaddu_vv_nxv3i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv3i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv3i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = 
shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv3i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv3i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv4i8(, , , i32) + +define @vsaddu_vv_nxv4i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv4i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i8: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv8i8(, , , i32) + +define @vsaddu_vv_nxv8i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call 
@llvm.vp.uadd.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv8i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; 
CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv16i8(, , , i32) + +define @vsaddu_vv_nxv16i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv16i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define 
@vsaddu_vi_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv32i8(, , , i32) + +define @vsaddu_vv_nxv32i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv32i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv32i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v 
= call @llvm.vp.uadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv32i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv32i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv64i8(, , , i32) + +define @vsaddu_vv_nxv64i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv64i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv64i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv64i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector 
%head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works when the mask itself needs splitting. + +declare @llvm.vp.uadd.sat.nxv128i8(, , , i32) + +define @vsaddu_vi_nxv128i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub a2, a1, a0 +; CHECK-NEXT: sltu a3, a1, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a1, a0, .LBB50_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv128i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv128i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1 +; CHECK-NEXT: bltu a0, a1, .LBB51_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB51_2: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.uadd.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv1i16(, , , i32) + +define @vsaddu_vv_nxv1i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv1i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = 
insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv2i16(, , , i32) + +define @vsaddu_vv_nxv2i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv2i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i16_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv4i16(, , , i32) + +define @vsaddu_vv_nxv4i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv4i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.uadd.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv8i16(, , , i32) + +define @vsaddu_vv_nxv8i16( %va, %b, 
%m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv8i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + 
ret %v +} + +define @vsaddu_vi_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv16i16(, , , i32) + +define @vsaddu_vv_nxv16i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv16i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + 
%vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv32i16(, , , i32) + +define @vsaddu_vv_nxv32i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv32i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv32i16( %va, i16 %b, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vsaddu_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv32i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv1i32(, , , i32) + +define @vsaddu_vv_nxv1i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, 
ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv1i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i32_unmasked: 
+; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv2i32(, , , i32) + +define @vsaddu_vv_nxv2i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv2i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, 
poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv4i32(, , , i32) + +define @vsaddu_vv_nxv4i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv4i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; 
CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv8i32(, , , i32) + +define @vsaddu_vv_nxv8i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define 
@vsaddu_vv_nxv8i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement 
poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv16i32(, , , i32) + +define @vsaddu_vv_nxv16i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv16i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i32( %va, %m, 
i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works then the mask needs manual splitting. + +declare @llvm.vp.uadd.sat.nxv32i32(, , , i32) + +define @vsaddu_vi_nxv32i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a0, a1, .LBB118_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define 
@vsaddu_vi_nxv32i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1 +; CHECK-NEXT: bltu a0, a1, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv1i64(, , , i32) + +define @vsaddu_vv_nxv1i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv1i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), 
zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, 
e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv2i64(, , , i32) + +define @vsaddu_vv_nxv2i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv2i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i64_unmasked( %va, i64 
%b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv4i64(, , , i32) + +define @vsaddu_vv_nxv4i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv4i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; 
RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv8i64(, , , i32) + +define @vsaddu_vv_nxv8i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv8i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.uadd.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, 
e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll new file mode 100644 index 0000000000000..2d51a2ee44f65 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll @@ -0,0 +1,2067 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.ssub.sat.nxv8i7(, , , i32) + +define @vssub_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 192 +; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i7 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i7( %a, %vb, 
%mask, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv1i8(, , , i32) + +define @vssub_vv_nxv1i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv1i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i8_commute( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v9, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %vb, %va, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, 
poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv2i8(, , , i32) + +define @vssub_vv_nxv2i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv2i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i8: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv3i8(, , , i32) + +define @vssub_vv_nxv3i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: 
vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv3i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv3i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv3i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv3i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv4i8(, , , i32) + +define @vssub_vv_nxv4i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv4i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.ssub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv8i8(, , , i32) + +define @vssub_vv_nxv8i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv8i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = 
insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv16i8(, , , i32) + +define @vssub_vv_nxv16i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define 
@vssub_vv_nxv16i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + 
%elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv32i8(, , , i32) + +define @vssub_vv_nxv32i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv32i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv32i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv32i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i8( %va, %m, i32 zeroext 
%evl) { +; CHECK-LABEL: vssub_vi_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv64i8(, , , i32) + +define @vssub_vv_nxv64i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv64i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv64i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = 
call @llvm.vp.ssub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv64i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works when the mask itself needs splitting. 
+ +declare @llvm.vp.ssub.sat.nxv128i8(, , , i32) + +define @vssub_vi_nxv128i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub a0, a1, a2 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB50_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv128i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv128i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2 +; CHECK-NEXT: bltu a0, a1, .LBB51_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB51_2: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv1i16(, , , i32) + +define @vssub_vv_nxv1i16( %va, %b, 
%m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv1i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %vb, 
%m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv2i16(, , , i32) + +define @vssub_vv_nxv2i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv2i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, 
i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv4i16(, , , i32) + +define @vssub_vv_nxv4i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv4i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i16( %va, i16 
%b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv8i16(, , , i32) + +define @vssub_vv_nxv8i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i16: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv8i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i16_unmasked( 
%va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv16i16(, , , i32) + +define @vssub_vv_nxv16i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv16i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, 
zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv32i16(, , , i32) + +define @vssub_vv_nxv32i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv32i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv32i16( %va, i16 %b, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vssub_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv32i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv1i32(, , , i32) + +define @vssub_vv_nxv1i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv1i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i32_unmasked( %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vssub_vi_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv2i32(, , , i32) + +define @vssub_vv_nxv2i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv2i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = 
insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv4i32(, , , i32) + +define @vssub_vv_nxv4i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv4i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv8i32(, , , i32) + +define @vssub_vv_nxv8i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: 
vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv8i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i32_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv16i32(, , , i32) + +define @vssub_vv_nxv16i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv16i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = 
shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works then the mask needs manual splitting. 
+ +declare @llvm.vp.ssub.sat.nxv32i32(, , , i32) + +define @vssub_vi_nxv32i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a1, a2, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: sub a1, a0, a2 +; CHECK-NEXT: sltu a3, a0, a1 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a1 +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a1, v0.t +; CHECK-NEXT: bltu a0, a2, .LBB118_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2 +; CHECK-NEXT: bltu a0, a1, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare 
@llvm.vp.ssub.sat.nxv1i64(, , , i32) + +define @vssub_vv_nxv1i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv1i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; 
RV32-NEXT: vssub.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv2i64(, , , i32) + +define @vssub_vv_nxv2i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv2i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, 
m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %vb, %m, 
i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv4i64(, , , i32) + +define @vssub_vv_nxv4i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv4i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, 
sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i64_unmasked( %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vssub_vi_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv8i64(, , , i32) + +define @vssub_vv_nxv8i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv8i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = 
call @llvm.vp.ssub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll new file mode 100644 index 0000000000000..e5589ce1a9bc6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll @@ -0,0 +1,2065 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.usub.sat.nxv8i7(, , , i32) + +define @vssubu_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v9, v9, a2 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i7 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i7( %a, %vb, %mask, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv1i8(, , , i32) + +define @vssubu_vv_nxv1i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv1i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vssubu_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i8_commute( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v9, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %vb, %va, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: 
vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv2i8(, , , i32) + +define @vssubu_vv_nxv2i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv2i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + 
+define @vssubu_vi_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv3i8(, , , i32) + +define @vssubu_vv_nxv3i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv3i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = 
shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv3i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv3i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv3i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv4i8(, , , i32) + +define @vssubu_vv_nxv4i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv4i8_unmasked( %va, %b, i32 
zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, 
i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv8i8(, , , i32) + +define @vssubu_vv_nxv8i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv8i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i8: +; 
CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv16i8(, , , i32) + +define @vssubu_vv_nxv16i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv16i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, 
%vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv32i8(, , , i32) + +define @vssubu_vv_nxv32i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv32i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv32i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv32i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = 
insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv64i8(, , , i32) + +define @vssubu_vv_nxv64i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv64i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv64i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv64i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; 
CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works when the mask itself needs splitting. + +declare @llvm.vp.usub.sat.nxv128i8(, , , i32) + +define @vssubu_vi_nxv128i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub a0, a1, a2 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB50_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv128i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: 
vssubu_vi_nxv128i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a2 +; CHECK-NEXT: bltu a0, a1, .LBB51_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB51_2: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv1i16(, , , i32) + +define @vssubu_vv_nxv1i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv1i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + 
ret %v +} + +define @vssubu_vx_nxv1i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv2i16(, , , i32) + +define @vssubu_vv_nxv2i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv2i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + 
%head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv4i16(, , , i32) + +define @vssubu_vv_nxv4i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv4i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv8i16(, , , i32) + +define @vssubu_vv_nxv8i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv8i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v 
+} + +define @vssubu_vx_nxv8i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv16i16(, , , i32) + +define @vssubu_vv_nxv16i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv16i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli 
zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + 
%head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv32i16(, , , i32) + +define @vssubu_vv_nxv32i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv32i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv32i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv32i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i16: +; CHECK: # %bb.0: +; 
CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv1i32(, , , i32) + +define @vssubu_vv_nxv1i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv1i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, 
%vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv2i32(, , , i32) + +define @vssubu_vv_nxv2i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv2i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i32_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, 
zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv4i32(, , , i32) + +define @vssubu_vv_nxv4i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv4i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv8i32(, , , i32) + +define @vssubu_vv_nxv8i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv8i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %vb, 
%m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv16i32(, , , i32) + +define @vssubu_vv_nxv16i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv16i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, 
zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works then the mask needs manual splitting. + +declare @llvm.vp.usub.sat.nxv32i32(, , , i32) + +define @vssubu_vi_nxv32i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a1, a2, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: sub a1, a0, a2 +; CHECK-NEXT: sltu a3, a0, a1 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a1 +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a1, v0.t +; CHECK-NEXT: bltu a0, a2, .LBB118_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a2 +; CHECK-NEXT: bltu a0, a1, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 
-1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv1i64(, , , i32) + +define @vssubu_vv_nxv1i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv1i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, 
-16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv2i64(, , , i32) + +define @vssubu_vv_nxv2i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, 
v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv2i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssubu.vx v8, 
v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv4i64(, , , i32) + +define @vssubu_vv_nxv4i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv4i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( 
%va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, 
e64, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv8i64(, , , i32) + +define @vssubu_vv_nxv8i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv8i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v16, 
v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: 
vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index 7a9d91cc76b34..e3462f0f33f11 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -163,6 +163,14 @@ class VPIntrinsicTest : public testing::Test { << "(<8 x i16>, i1 immarg, <8 x i1>, i32) "; Str << " declare <8 x i16> @llvm.vp.cttz.v8i16" << "(<8 x i16>, i1 immarg, <8 x i1>, i32) "; + Str << " declare <8 x i16> @llvm.vp.sadd.sat.v8i16" + << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) "; + Str << " declare <8 x i16> @llvm.vp.uadd.sat.v8i16" + << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) "; + Str << " declare <8 x i16> @llvm.vp.ssub.sat.v8i16" + << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) "; + Str << " declare <8 x i16> @llvm.vp.usub.sat.v8i16" + << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) "; Str << " declare <8 x i16> @llvm.vp.fshl.v8i16" << "(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i1>, i32) "; Str << " declare <8 x i16> @llvm.vp.fshr.v8i16" From d7a28f7ad77504694ad8bdc6b2aaa8938f08fbdd Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 23 Feb 2024 14:34:57 +0800 Subject: [PATCH 310/351] [RISCV] Add asserts for insert/extract_subvector invariants. NFC We can currently select insert_subvector and extract_subvector nodes in RISCVISelDAGToDAG (this is after custom legalizing in RISCVISelLowering) with fixed subvector types. However decomposeSubvectorInsertExtractToSubRegs is based off of scalable subvectors where the indices are scaled by vscale, so any index other than 0 will be wrong. 
For insert_subvector the vector being inserted into needs to be undef as well, because it assumes we can replace a whole subregister which isn't always the case for fixed subvectors (e.g. insert <2 x i32> into <4 x i32> at index 0 with vlen=128). We currently maintain these invariants in RISCVISelLowering, so this adds asserts in RISCVISelDAGToDAG so we don't break them. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 904f1d7fdf906..c922098c55094 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2062,8 +2062,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); MVT SubVecContainerVT = SubVecVT; // Establish the correct scalable-vector types for any fixed-length type. - if (SubVecVT.isFixedLengthVector()) + if (SubVecVT.isFixedLengthVector()) { + assert(Idx == 0 && V.isUndef()); SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT); + } if (VT.isFixedLengthVector()) VT = TLI.getContainerForFixedLengthVector(VT); @@ -2115,8 +2117,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); MVT SubVecContainerVT = VT; // Establish the correct scalable-vector types for any fixed-length type. - if (VT.isFixedLengthVector()) + if (VT.isFixedLengthVector()) { + assert(Idx == 0); SubVecContainerVT = TLI.getContainerForFixedLengthVector(VT); + } if (InVT.isFixedLengthVector()) InVT = TLI.getContainerForFixedLengthVector(InVT); From 1fe6be8794964c011aeba7a66bd2dcd891d21ab0 Mon Sep 17 00:00:00 2001 From: Freddy Ye Date: Fri, 23 Feb 2024 15:18:42 +0800 Subject: [PATCH 311/351] [X86] Support APXF to enable __builtin_cpu_supports. 
(#80636) For referring, APX's spec: https://cdrdv2.intel.com/v1/dl/getContent/784266 APX's index in libgcc: https://github.com/gcc-mirror/gcc/blob/master/gcc/common/config/i386/i386-cpuinfo.h#L267 --- clang/lib/Headers/cpuid.h | 1 + clang/test/CodeGen/target-builtin-noerror.c | 1 + compiler-rt/lib/builtins/cpu_model/x86.c | 6 ++++-- llvm/include/llvm/TargetParser/X86TargetParser.def | 1 + llvm/lib/TargetParser/Host.cpp | 7 +++++++ 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h index c968d37fb8cd6..0bb9912b465ff 100644 --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -219,6 +219,7 @@ #define bit_PREFETCHI 0x00004000 #define bit_USERMSR 0x00008000 #define bit_AVX10 0x00080000 +#define bit_APXF 0x00200000 /* Features in %eax for leaf 13 sub-leaf 1 */ #define bit_XSAVEOPT 0x00000001 diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c index 9608b5f37baaa..b438e50848a4b 100644 --- a/clang/test/CodeGen/target-builtin-noerror.c +++ b/clang/test/CodeGen/target-builtin-noerror.c @@ -141,6 +141,7 @@ void verifyfeaturestrings(void) { (void)__builtin_cpu_supports("sm3"); (void)__builtin_cpu_supports("sha512"); (void)__builtin_cpu_supports("sm4"); + (void)__builtin_cpu_supports("apxf"); (void)__builtin_cpu_supports("usermsr"); (void)__builtin_cpu_supports("avx10.1-256"); (void)__builtin_cpu_supports("avx10.1-512"); diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index 1afa468c4ae8c..7e8acb3e73eda 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -217,8 +217,8 @@ enum ProcessorFeatures { FEATURE_SM3, FEATURE_SHA512, FEATURE_SM4, - // FEATURE_APXF, - FEATURE_USERMSR = 112, + FEATURE_APXF, + FEATURE_USERMSR, FEATURE_AVX10_1_256, FEATURE_AVX10_1_512, CPU_FEATURE_MAX @@ -983,6 +983,8 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, 
unsigned MaxLeaf, setFeature(FEATURE_USERMSR); if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1)) setFeature(FEATURE_AVX10_1_256); + if (HasLeaf7Subleaf1 && ((EDX >> 21) & 1)) + setFeature(FEATURE_APXF); unsigned MaxLevel; getX86CpuIDAndInfo(0, &MaxLevel, &EBX, &ECX, &EDX); diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 4c630c1eb06e8..a9ed56fcd4700 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -265,6 +265,7 @@ X86_MICROARCH_LEVEL(X86_64_BASELINE,"x86-64", 95) X86_MICROARCH_LEVEL(X86_64_V2, "x86-64-v2", 96) X86_MICROARCH_LEVEL(X86_64_V3, "x86-64-v3", 97) X86_MICROARCH_LEVEL(X86_64_V4, "x86-64-v4", 98) +X86_MICROARCH_LEVEL(APXF, "apxf", 111) #undef X86_FEATURE_COMPAT #undef X86_FEATURE #undef X86_MICROARCH_LEVEL diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 4466d50458e19..a4cc757a9214e 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1846,6 +1846,13 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1); Features["usermsr"] = HasLeaf7Subleaf1 && ((EDX >> 15) & 1); Features["avx10.1-256"] = HasLeaf7Subleaf1 && ((EDX >> 19) & 1); + bool HasAPXF = HasLeaf7Subleaf1 && ((EDX >> 21) & 1); + Features["egpr"] = HasAPXF; + Features["push2pop2"] = HasAPXF; + Features["ppx"] = HasAPXF; + Features["ndd"] = HasAPXF; + Features["ccmp"] = HasAPXF; + Features["cf"] = HasAPXF; bool HasLeafD = MaxLevel >= 0xd && !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX); From 354401f8d3dc08ed41895d03a12a122e9cc0482c Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 22 Feb 2024 23:53:12 -0800 Subject: [PATCH 312/351] [lldb] Fix GetTerminalWidth after afd469023aad afd469023aad fixed the type of the term-width setting but the getter (Debugger::GetTerminalWidth) was still trying to get the terminal width 
as an unsigned. This fixes TestXMLRegisterFlags.py. --- lldb/source/Core/Debugger.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index bb81110ae35a5..c3e603dbae896 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -365,7 +365,7 @@ bool Debugger::SetREPLLanguage(lldb::LanguageType repl_lang) { uint64_t Debugger::GetTerminalWidth() const { const uint32_t idx = ePropertyTerminalWidth; - return GetPropertyAtIndexAs( + return GetPropertyAtIndexAs( idx, g_debugger_properties[idx].default_uint_value); } From 531e8c26b3f2626e7f1a997e0e8b61d67d10aded Mon Sep 17 00:00:00 2001 From: Dani Date: Fri, 23 Feb 2024 09:04:33 +0100 Subject: [PATCH 313/351] [llvm][AArch64] Autoupgrade function attributes from Module attributes. (#80640) `sign-return-address` and similar module attributes should be propagated to the function level before modules got merged because module flags may contradict and this information is not recoverable. Generated code will match with the normal linking flow. --- llvm/include/llvm/IR/AutoUpgrade.h | 3 +- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/IR/AutoUpgrade.cpp | 72 ++++++++++++++++++- llvm/lib/Linker/IRMover.cpp | 4 ++ .../test/Bitcode/upgrade-arc-runtime-calls.ll | 4 +- .../AArch64/link-branch-target-enforcement.ll | 1 + .../LTO/AArch64/link-sign-return-address.ll | 43 +++++++++++ llvm/test/Linker/link-arm-and-thumb.ll | 7 +- 8 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 llvm/test/LTO/AArch64/link-sign-return-address.ll diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h index 152f781ffa9b3..c0d96efc54752 100644 --- a/llvm/include/llvm/IR/AutoUpgrade.h +++ b/llvm/include/llvm/IR/AutoUpgrade.h @@ -67,7 +67,8 @@ namespace llvm { void UpgradeSectionAttributes(Module &M); /// Correct any IR that is relying on old function attribute behavior. 
- void UpgradeFunctionAttributes(Function &F); + void UpgradeFunctionAttributes(Function &F, + bool ModuleMetadataIsMaterialized = false); /// If the given TBAA tag uses the scalar TBAA format, create a new node /// corresponding to the upgrade to the struct-path aware TBAA format. diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 832907a3f53f5..8c860101afa02 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -6706,7 +6706,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) { } // Look for functions that rely on old function attribute behavior. - UpgradeFunctionAttributes(*F); + UpgradeFunctionAttributes(*F, true); // Bring in any functions that this function forward-referenced via // blockaddresses. diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index b90bbe71ac189..edff13c796b31 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5155,7 +5155,46 @@ struct StrictFPUpgradeVisitor : public InstVisitor { }; } // namespace -void llvm::UpgradeFunctionAttributes(Function &F) { +// Check if the module attribute is present and not zero. +static bool isModuleAttributeSet(const Module *M, const StringRef &ModAttr) { + const auto *Attr = + mdconst::extract_or_null(M->getModuleFlag(ModAttr)); + return Attr && Attr->getZExtValue(); +} + +// Copy an attribute from module to the function if exists. +// First value of the pair is used when the module attribute is not zero +// the second otherwise. +static void +CopyModuleAttributeToFunction(Function &F, StringRef FnAttrName, + StringRef ModAttrName, + std::pair Values) { + if (F.hasFnAttribute(FnAttrName)) + return; + F.addFnAttr(FnAttrName, isModuleAttributeSet(F.getParent(), ModAttrName) + ? Values.first + : Values.second); +} + +// Copy a boolean attribute from module to the function if exists. +// Module attribute treated false if zero otherwise true. 
+static void CopyModuleAttributeToFunction(Function &F, StringRef AttrName) { + CopyModuleAttributeToFunction( + F, AttrName, AttrName, + std::make_pair("true", "false")); +} + +// Copy an attribute from module to the function if exists. +// First value of the pair is used when the module attribute is not zero +// the second otherwise. +static void +CopyModuleAttributeToFunction(Function &F, StringRef AttrName, + std::pair Values) { + CopyModuleAttributeToFunction(F, AttrName, AttrName, Values); +} + +void llvm::UpgradeFunctionAttributes(Function &F, + bool ModuleMetadataIsMaterialized) { // If a function definition doesn't have the strictfp attribute, // convert any callsite strictfp attributes to nobuiltin. if (!F.isDeclaration() && !F.hasFnAttribute(Attribute::StrictFP)) { @@ -5167,6 +5206,37 @@ void llvm::UpgradeFunctionAttributes(Function &F) { F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType())); for (auto &Arg : F.args()) Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType())); + + if (!ModuleMetadataIsMaterialized) + return; + if (F.isDeclaration()) + return; + Module *M = F.getParent(); + if (!M) + return; + + Triple T(M->getTargetTriple()); + // Convert module level attributes to function level attributes because + // after merging modules the attributes might change and would have different + // effect on the functions as the original module would have. 
+ if (T.isThumb() || T.isARM() || T.isAArch64()) { + if (!F.hasFnAttribute("sign-return-address")) { + StringRef SignType = "none"; + if (isModuleAttributeSet(M, "sign-return-address")) + SignType = "non-leaf"; + + if (isModuleAttributeSet(M, "sign-return-address-all")) + SignType = "all"; + + F.addFnAttr("sign-return-address", SignType); + } + CopyModuleAttributeToFunction(F, "branch-target-enforcement"); + CopyModuleAttributeToFunction(F, "branch-protection-pauth-lr"); + CopyModuleAttributeToFunction(F, "guarded-control-stack"); + CopyModuleAttributeToFunction( + F, "sign-return-address-key", + std::make_pair("b_key", "a_key")); + } } static bool isOldLoopArgument(Metadata *MD) { diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 37d21119447b9..9f45ebc6eda01 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1606,6 +1606,10 @@ Error IRLinker::run() { // Loop over all of the linked values to compute type mappings. computeTypeMapping(); + // Update function attributes before copying them to destation module. + for (Function &F : SrcM->getFunctionList()) + UpgradeFunctionAttributes(F, true); + std::reverse(Worklist.begin(), Worklist.end()); while (!Worklist.empty()) { GlobalValue *GV = Worklist.back(); diff --git a/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll b/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll index 19f25f98953fa..d2edec18d55e5 100644 --- a/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll +++ b/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll @@ -55,7 +55,7 @@ unwindBlock: // Check that auto-upgrader converts function calls to intrinsic calls. Note that // the auto-upgrader doesn't touch invoke instructions. 
-// ARC: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) personality +// ARC: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) #0 personality // ARC: %[[V0:.*]] = tail call ptr @llvm.objc.autorelease(ptr %[[A]]) // ARC-NEXT: tail call void @llvm.objc.autoreleasePoolPop(ptr %[[A]]) // ARC-NEXT: %[[V1:.*]] = tail call ptr @llvm.objc.autoreleasePoolPush() @@ -88,7 +88,7 @@ unwindBlock: // ARC-NEXT: tail call void @llvm.objc.arc.annotation.bottomup.bbend(ptr %[[B]], ptr %[[C]]) // ARC-NEXT: invoke void @objc_autoreleasePoolPop(ptr %[[A]]) -// NOUPGRADE: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) personality +// NOUPGRADE: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) #0 personality // NOUPGRADE: %[[V0:.*]] = tail call ptr @objc_autorelease(ptr %[[A]]) // NOUPGRADE-NEXT: tail call void @objc_autoreleasePoolPop(ptr %[[A]]) // NOUPGRADE-NEXT: %[[V1:.*]] = tail call ptr @objc_autoreleasePoolPush() diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll index ccf8cf67ede6d..74d9c86881d52 100644 --- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll +++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll @@ -32,6 +32,7 @@ entry: ; CHECK-DUMP:
: ; CHECK-DUMP: bl 0x8 ; CHECK-DUMP: : +; CHECK-DUMP: paciasp ; `main` doesn't support BTI while `foo` does, so in the binary ; we should see only PAC which is supported by both. diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll new file mode 100644 index 0000000000000..c25857ceed7b4 --- /dev/null +++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll @@ -0,0 +1,43 @@ +; Testcase to check that module with different branch-target-enforcement can +; be mixed. +; +; RUN: llvm-as %s -o %t1.bc +; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc +; RUN: llvm-lto -exported-symbol main \ +; RUN: -exported-symbol foo \ +; RUN: -filetype=obj \ +; RUN: %t2.bc %t1.bc \ +; RUN: -o %t1.exe 2>&1 +; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s +; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare i32 @foo(); + +define i32 @main() { +entry: + %add = call i32 @foo() + ret i32 %add +} + +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 0} +!1 = !{i32 8, !"sign-return-address", i32 0} +!2 = !{i32 8, !"sign-return-address-all", i32 0} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} + +; CHECK-DUMP: : +; CHECK-DUMP: paciasp +; CHECK-DUMP: mov w0, #0x2a +; CHECK-DUMP: autiasp +; CHECK-DUMP: ret +; CHECK-DUMP:
: +; CHECK-DUMP-NOT: paciasp +; CHECK-DUMP: str x30, +; CHECK-DUMP: bl 0x14 + +; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary +; we should not see anything. +; CHECK-PROP-NOT: Properties: aarch64 feature: PAC \ No newline at end of file diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll index a90f2128e4430..37bd8c37f8b5e 100644 --- a/llvm/test/Linker/link-arm-and-thumb.ll +++ b/llvm/test/Linker/link-arm-and-thumb.ll @@ -13,11 +13,12 @@ entry: ret i32 %add } -; CHECK: define i32 @main() { +; CHECK: define i32 @main() [[MAIN_ATTRS:#[0-9]+]] ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]] ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]] -; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" } -; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" } +; CHECK: attributes [[MAIN_ATTRS]] = { {{.*}} } +; CHECK: attributes [[ARM_ATTRS]] = { {{.*}} "target-features"="-thumb-mode" } +; CHECK: attributes [[THUMB_ATTRS]] = { {{.*}} "target-features"="+thumb-mode" } ; STDERR-NOT: warning: Linking two modules of different target triples: From 6fae3e784472751002570f367c378cb2dbd82c26 Mon Sep 17 00:00:00 2001 From: Dani Date: Fri, 23 Feb 2024 09:30:36 +0100 Subject: [PATCH 314/351] [llvm][AArch64] Do not inline a function with different signing scheme. (#80642) If the signing scheme is different that maybe the functions assumes different behaviours and dangerous to inline them without analysing them. This should be a rare case. 
--- llvm/include/llvm/IR/Attributes.td | 28 +++-- llvm/lib/IR/Attributes.cpp | 5 + .../Inline/inline-sign-return-address.ll | 104 ++++++++++++++++++ llvm/utils/TableGen/Attributes.cpp | 6 +- 4 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 llvm/test/Transforms/Inline/inline-sign-return-address.ll diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 864f87f338389..d22eb76d2292d 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -339,14 +339,26 @@ def UseSampleProfile : StrBoolAttr<"use-sample-profile">; def DenormalFPMath : ComplexStrAttr<"denormal-fp-math", [FnAttr]>; def DenormalFPMathF32 : ComplexStrAttr<"denormal-fp-math-f32", [FnAttr]>; +// Attribute compatiblity rules are generated to check the attribute of the +// caller and callee and decide whether inlining should be allowed. CompatRule +// and child classes are used for the rule generation. CompatRule takes only a +// compare function which could be templated with the attribute type. +// CompatRuleStrAttr takes the compare function and the string attribute for +// checking compatibility for inline substitution. class CompatRule { - // The name of the function called to check the attribute of the caller and - // callee and decide whether inlining should be allowed. The function's - // signature must match "bool(const Function&, const Function &)", where the - // first parameter is the reference to the caller and the second parameter is - // the reference to the callee. It must return false if the attributes of the - // caller and callee are incompatible, and true otherwise. + // The function's signature must match "bool(const Function&, const + // Function&)", where the first parameter is the reference to the caller and + // the second parameter is the reference to the callee. It must return false + // if the attributes of the caller and callee are incompatible, and true + // otherwise. 
string CompatFunc = F; + string AttrName = ""; +} + +class CompatRuleStrAttr : CompatRule { + // The checker function is extended with an third argument as the function + // attribute string "bool(const Function&, const Function&, const StringRef&)". + string AttrName = Attr; } def : CompatRule<"isEqual">; @@ -359,7 +371,9 @@ def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"checkDenormMode">; - +def : CompatRuleStrAttr<"isEqual", "sign-return-address">; +def : CompatRuleStrAttr<"isEqual", "sign-return-address-key">; +def : CompatRuleStrAttr<"isEqual", "branch-protection-pauth-lr">; class MergeRule { // The name of the function called to merge the attributes of the caller and diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index fd5160209506f..19076771ff2ea 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2045,6 +2045,11 @@ static bool isEqual(const Function &Caller, const Function &Callee) { Callee.getFnAttribute(AttrClass::getKind()); } +static bool isEqual(const Function &Caller, const Function &Callee, + const StringRef &AttrName) { + return Caller.getFnAttribute(AttrName) == Callee.getFnAttribute(AttrName); +} + /// Compute the logical AND of the attributes of the caller and the /// callee. /// diff --git a/llvm/test/Transforms/Inline/inline-sign-return-address.ll b/llvm/test/Transforms/Inline/inline-sign-return-address.ll new file mode 100644 index 0000000000000..c4d85fa671a4f --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-sign-return-address.ll @@ -0,0 +1,104 @@ +; Check the inliner doesn't inline a function with different sign return address schemes. 
+; RUN: opt < %s -passes=inline -S | FileCheck %s + +define internal void @foo_all() #0 { + ret void +} + +define internal void @foo_nonleaf() #1 { + ret void +} + +define internal void @foo_none() #2 { + ret void +} + +define internal void @foo_lr() #3 { + ret void +} + +define internal void @foo_bkey() #4 { + ret void +} + +define dso_local void @bar_all() #0 { +; CHECK-LABEL: bar_all +; CHECK-NOT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_nonleaf() #1 { +; CHECK-LABEL: bar_nonleaf +; CHECK-NEXT: call void @foo_all() +; CHECK-NOT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_none() #2 { +; CHECK-LABEL: bar_none +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NOT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_lr() #3 { +; CHECK-LABEL: bar_lr +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NOT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_bkey() #4 { +; CHECK-LABEL: bar_bkey +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void 
@foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NOT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + + +attributes #0 = { "branch-protection-pauth-lr"="false" "sign-return-address"="all" } +attributes #1 = { "branch-protection-pauth-lr"="false" "sign-return-address"="non-leaf" } +attributes #2 = { "branch-protection-pauth-lr"="false" "sign-return-address"="none" } +attributes #3 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" } +attributes #4 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" } \ No newline at end of file diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp index 474042a3e9a33..db3c4decccb4c 100644 --- a/llvm/utils/TableGen/Attributes.cpp +++ b/llvm/utils/TableGen/Attributes.cpp @@ -87,7 +87,11 @@ void Attributes::emitFnAttrCompatCheck(raw_ostream &OS, bool IsStringAttr) { for (auto *Rule : CompatRules) { StringRef FuncName = Rule->getValueAsString("CompatFunc"); - OS << " Ret &= " << FuncName << "(Caller, Callee);\n"; + OS << " Ret &= " << FuncName << "(Caller, Callee"; + StringRef AttrName = Rule->getValueAsString("AttrName"); + if (!AttrName.empty()) + OS << ", \"" << AttrName << "\""; + OS << ");\n"; } OS << "\n"; From 5ca877591e65acf18b5a8d3234ff88b215b4f369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Fri, 23 Feb 2024 09:35:38 +0100 Subject: [PATCH 315/351] [clang][analyzer] Fix argument invalidations in StreamChecker. (#79470) Specific arguments passed to stream handling functions are changed by the function, this means these should be invalidated ("escaped") by the analyzer. This change adds the argument invalidation (in specific cases) to the checker. 
--- .../StaticAnalyzer/Checkers/StreamChecker.cpp | 39 ++++- clang/test/Analysis/stream-invalidate.c | 147 ++++++++++++++++++ 2 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 clang/test/Analysis/stream-invalidate.c diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index a070f451694a3..65bdc4cac3094 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -21,6 +21,7 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h" #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h" +#include "llvm/ADT/Sequence.h" #include #include @@ -629,6 +630,21 @@ const ExplodedNode *StreamChecker::getAcquisitionSite(const ExplodedNode *N, return nullptr; } +static ProgramStateRef escapeArgs(ProgramStateRef State, CheckerContext &C, + const CallEvent &Call, + ArrayRef EscapingArgs) { + const auto *CE = Call.getOriginExpr(); + + SmallVector EscapingVals; + EscapingVals.reserve(EscapingArgs.size()); + for (auto EscArgIdx : EscapingArgs) + EscapingVals.push_back(Call.getArgSVal(EscArgIdx)); + State = State->invalidateRegions(EscapingVals, CE, C.blockCount(), + C.getLocationContext(), + /*CausesPointerEscape=*/false); + return State; +} + //===----------------------------------------------------------------------===// // Methods of StreamChecker. //===----------------------------------------------------------------------===// @@ -819,6 +835,11 @@ void StreamChecker::evalFreadFwrite(const FnDescription *Desc, return; } + // At read, invalidate the buffer in any case of error or success, + // except if EOF was already present. + if (IsFread && !E.isStreamEof()) + State = escapeArgs(State, C, Call, {0}); + // Generate a transition for the success state. // If we know the state to be FEOF at fread, do not add a success state. 
if (!IsFread || !E.isStreamEof()) { @@ -863,6 +884,9 @@ void StreamChecker::evalFgetx(const FnDescription *Desc, const CallEvent &Call, return; if (!E.isStreamEof()) { + // If there was already EOF, assume that read buffer is not changed. + // Otherwise it may change at success or failure. + State = escapeArgs(State, C, Call, {0}); if (SingleChar) { // Generate a transition for the success state of `fgetc`. NonLoc RetVal = makeRetVal(C, E.CE).castAs(); @@ -1011,6 +1035,14 @@ void StreamChecker::evalFscanf(const FnDescription *Desc, const CallEvent &Call, State->BindExpr(E.CE, C.getLocationContext(), RetVal); StateNotFailed = E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, E.getZeroVal(Call)); + if (!StateNotFailed) + return; + + SmallVector EscArgs; + for (auto EscArg : llvm::seq(2u, Call.getNumArgs())) + EscArgs.push_back(EscArg); + StateNotFailed = escapeArgs(StateNotFailed, C, Call, EscArgs); + if (StateNotFailed) C.addTransition(StateNotFailed); } @@ -1073,8 +1105,12 @@ void StreamChecker::evalGetdelim(const FnDescription *Desc, // return -1. // If an error occurs, the function shall return -1 and set 'errno'. - // Add transition for the successful state. if (!E.isStreamEof()) { + // Escape buffer and size (may change by the call). + // May happen even at error (partial read?). + State = escapeArgs(State, C, Call, {0, 1}); + + // Add transition for the successful state. NonLoc RetVal = makeRetVal(C, E.CE).castAs(); ProgramStateRef StateNotFailed = State->BindExpr(E.CE, C.getLocationContext(), RetVal); @@ -1161,6 +1197,7 @@ void StreamChecker::evalFgetpos(const FnDescription *Desc, ProgramStateRef StateNotFailed, StateFailed; std::tie(StateFailed, StateNotFailed) = E.makeRetValAndAssumeDual(State, C); + StateNotFailed = escapeArgs(StateNotFailed, C, Call, {1}); // This function does not affect the stream state. // Still we add success and failure state with the appropriate return value. 
diff --git a/clang/test/Analysis/stream-invalidate.c b/clang/test/Analysis/stream-invalidate.c new file mode 100644 index 0000000000000..6745d11a2fe70 --- /dev/null +++ b/clang/test/Analysis/stream-invalidate.c @@ -0,0 +1,147 @@ +// RUN: %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=alpha.unix.Stream \ +// RUN: -analyzer-checker=debug.ExprInspection + +#include "Inputs/system-header-simulator.h" + +void clang_analyzer_eval(int); +void clang_analyzer_dump(int); + +void test_fread(void) { + FILE *F = fopen("file", "r+"); + if (!F) + return; + + char Buf[3] = {10, 10, 10}; + fread(Buf, 1, 3, F); + // The check applies to success and failure. + clang_analyzer_dump(Buf[0]); // expected-warning {{conj_$}} Should not preserve the previous value, thus should not be 10. + clang_analyzer_dump(Buf[2]); // expected-warning {{conj_$}} + if (feof(F)) { + char Buf1[3] = {10, 10, 10}; + fread(Buf1, 1, 3, F); // expected-warning {{is in EOF state}} + clang_analyzer_dump(Buf1[0]); // expected-warning {{10 S32b}} + clang_analyzer_dump(Buf1[2]); // expected-warning {{10 S32b}} + } + + fclose(F); +} + +void test_fwrite(void) { + FILE *F = fopen("file", "r+"); + if (!F) + return; + + char Buf[3] = {10, 10, 10}; + fwrite(Buf, 1, 3, F); + // The check applies to success and failure. + clang_analyzer_dump(Buf[0]); // expected-warning {{10 S32b}} + clang_analyzer_dump(Buf[2]); // expected-warning {{10 S32b}} + + fclose(F); +} + +void test_fgets() { + FILE *F = tmpfile(); + if (!F) + return; + + char Buf[3] = {10, 10, 10}; + fgets(Buf, 3, F); + // The check applies to success and failure. + clang_analyzer_dump(Buf[0]); // expected-warning {{conj_$}} Should not preserve the previous value, thus should not be 10. 
+ clang_analyzer_dump(Buf[2]); // expected-warning {{conj_$}} + if (feof(F)) { + char Buf1[3] = {10, 10, 10}; + fgets(Buf1, 3, F); // expected-warning {{is in EOF state}} + clang_analyzer_dump(Buf1[0]); // expected-warning {{10 S32b}} + clang_analyzer_dump(Buf1[2]); // expected-warning {{10 S32b}} + } + + fclose(F); +} + +void test_fputs() { + FILE *F = tmpfile(); + if (!F) + return; + + char *Buf = "aaa"; + fputs(Buf, F); + // The check applies to success and failure. + clang_analyzer_dump(Buf[0]); // expected-warning {{97 S32b}} + clang_analyzer_dump(Buf[2]); // expected-warning {{97 S32b}} + clang_analyzer_dump(Buf[3]); // expected-warning {{0 S32b}} + + fclose(F); +} + +void test_fscanf() { + FILE *F = tmpfile(); + if (!F) + return; + + int a = 1; + unsigned b; + int Ret = fscanf(F, "%d %u", &a, &b); + if (Ret == 0) { + clang_analyzer_dump(a); // expected-warning {{conj_$}} + // FIXME: should be {{1 S32b}}. + clang_analyzer_dump(b); // expected-warning {{conj_$}} + // FIXME: should be {{uninitialized value}}. + } else if (Ret == 1) { + clang_analyzer_dump(a); // expected-warning {{conj_$}} + clang_analyzer_dump(b); // expected-warning {{conj_$}} + // FIXME: should be {{uninitialized value}}. + } else if (Ret >= 2) { + clang_analyzer_dump(a); // expected-warning {{conj_$}} + clang_analyzer_dump(b); // expected-warning {{conj_$}} + clang_analyzer_eval(Ret == 2); // expected-warning {{FALSE}} expected-warning {{TRUE}} + // FIXME: should be only TRUE. 
+ } else { + clang_analyzer_dump(a); // expected-warning {{1 S32b}} + clang_analyzer_dump(b); // expected-warning {{uninitialized value}} + } + + fclose(F); +} + +void test_getdelim(char *P, size_t Sz) { + FILE *F = tmpfile(); + if (!F) + return; + + char *P1 = P; + size_t Sz1 = Sz; + ssize_t Ret = getdelim(&P, &Sz, '\t', F); + if (Ret < 0) { + clang_analyzer_eval(P == P1); // expected-warning {{FALSE}} \ + // expected-warning {{TRUE}} + clang_analyzer_eval(Sz == Sz1); // expected-warning {{FALSE}} \ + // expected-warning {{TRUE}} + } else { + clang_analyzer_eval(P == P1); // expected-warning {{FALSE}} \ + // expected-warning {{TRUE}} + clang_analyzer_eval(Sz == Sz1); // expected-warning {{FALSE}} \ + // expected-warning {{TRUE}} + } + + fclose(F); +} + +void test_fgetpos() { + FILE *F = tmpfile(); + if (!F) + return; + + fpos_t Pos = 1; + int Ret = fgetpos(F, &Pos); + if (Ret == 0) { + clang_analyzer_dump(Pos); // expected-warning {{conj_$}} + } else { + clang_analyzer_dump(Pos); // expected-warning {{1 S32b}} + } + + fclose(F); +} From d68d29516102252f6bf6dc23fb22cef144ca1cb3 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 09:48:13 +0100 Subject: [PATCH 316/351] [mlir][Transforms][NFC] Turn op/block arg replacements into `IRRewrite`s (#81757) This commit is a refactoring of the dialect conversion. The dialect conversion maintains a list of "IR rewrites" that can be committed (upon success) or rolled back (upon failure). Until now, op replacements and block argument replacements were kept track in separate data structures inside the dialect conversion. This commit turns them into `IRRewrite`s, so that they can be committed or rolled back just like any other rewrite. This simplifies the internal state of the dialect conversion. Overview of changes: * Add two new rewrite classes: `ReplaceBlockArgRewrite` and `ReplaceOperationRewrite`. Remove the `OpReplacement` helper class; it is now part of `ReplaceOperationRewrite`. 
* Simplify `RewriterState`: `numReplacements` and `numArgReplacements` are no longer needed. (Now being kept track of by `numRewrites`.) * Add `IRRewrite::cleanup`. Operations should not be erased in `commit` because they may still be referenced in other internal state of the dialect conversion (`mapping`). Detaching operations is fine. * `trackedOps` are now updated during the "commit" phase instead of after applying all rewrites. --- .../Transforms/Utils/DialectConversion.cpp | 297 +++++++++--------- 1 file changed, 157 insertions(+), 140 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index db41b9f19e7e8..dec68048dc1d3 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -153,14 +153,12 @@ namespace { /// This is useful when saving and undoing a set of rewrites. struct RewriterState { RewriterState(unsigned numCreatedOps, unsigned numUnresolvedMaterializations, - unsigned numReplacements, unsigned numArgReplacements, unsigned numRewrites, unsigned numIgnoredOperations, unsigned numErased) : numCreatedOps(numCreatedOps), numUnresolvedMaterializations(numUnresolvedMaterializations), - numReplacements(numReplacements), - numArgReplacements(numArgReplacements), numRewrites(numRewrites), - numIgnoredOperations(numIgnoredOperations), numErased(numErased) {} + numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), + numErased(numErased) {} /// The current number of created operations. unsigned numCreatedOps; @@ -168,12 +166,6 @@ struct RewriterState { /// The current number of unresolved materializations. unsigned numUnresolvedMaterializations; - /// The current number of replacements queued. - unsigned numReplacements; - - /// The current number of argument replacements queued. - unsigned numArgReplacements; - /// The current number of rewrites performed. 
unsigned numRewrites; @@ -184,20 +176,6 @@ struct RewriterState { unsigned numErased; }; -//===----------------------------------------------------------------------===// -// OpReplacement - -/// This class represents one requested operation replacement via 'replaceOp' or -/// 'eraseOp`. -struct OpReplacement { - OpReplacement(const TypeConverter *converter = nullptr) - : converter(converter) {} - - /// An optional type converter that can be used to materialize conversions - /// between the new and old values if necessary. - const TypeConverter *converter; -}; - //===----------------------------------------------------------------------===// // UnresolvedMaterialization @@ -321,19 +299,27 @@ class IRRewrite { MoveBlock, SplitBlock, BlockTypeConversion, + ReplaceBlockArg, // Operation rewrites MoveOperation, - ModifyOperation + ModifyOperation, + ReplaceOperation }; virtual ~IRRewrite() = default; - /// Roll back the rewrite. + /// Roll back the rewrite. Operations may be erased during rollback. virtual void rollback() = 0; - /// Commit the rewrite. + /// Commit the rewrite. Operations may be unlinked from their blocks during + /// the commit phase, but they must not be erased yet. This is because + /// internal dialect conversion state (such as `mapping`) may still be using + /// them. Operations must be erased during cleanup. virtual void commit() {} + /// Cleanup operations. Cleanup is called after commit. + virtual void cleanup() {} + Kind getKind() const { return kind; } static bool classof(const IRRewrite *rewrite) { return true; } @@ -360,7 +346,7 @@ class BlockRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::CreateBlock && - rewrite->getKind() <= Kind::BlockTypeConversion; + rewrite->getKind() <= Kind::ReplaceBlockArg; } protected: @@ -428,6 +414,8 @@ class EraseBlockRewrite : public BlockRewrite { void commit() override { // Erase the block. 
assert(block && "expected block"); + assert(block->empty() && "expected empty block"); + block->dropAllDefinedValueUses(); delete block; block = nullptr; } @@ -589,6 +577,27 @@ class BlockTypeConversionRewrite : public BlockRewrite { const TypeConverter *converter; }; +/// Replacing a block argument. This rewrite is not immediately reflected in the +/// IR. An internal IR mapping is updated, but the actual replacement is delayed +/// until the rewrite is committed. +class ReplaceBlockArgRewrite : public BlockRewrite { +public: + ReplaceBlockArgRewrite(ConversionPatternRewriterImpl &rewriterImpl, + Block *block, BlockArgument arg) + : BlockRewrite(Kind::ReplaceBlockArg, rewriterImpl, block), arg(arg) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::ReplaceBlockArg; + } + + void commit() override; + + void rollback() override; + +private: + BlockArgument arg; +}; + /// An operation rewrite. class OperationRewrite : public IRRewrite { public: @@ -597,7 +606,7 @@ class OperationRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::MoveOperation && - rewrite->getKind() <= Kind::ModifyOperation; + rewrite->getKind() <= Kind::ReplaceOperation; } protected: @@ -698,6 +707,39 @@ class ModifyOperationRewrite : public OperationRewrite { SmallVector successors; void *propertiesStorage = nullptr; }; + +/// Replacing an operation. Erasing an operation is treated as a special case +/// with "null" replacements. This rewrite is not immediately reflected in the +/// IR. An internal IR mapping is updated, but values are not replaced and the +/// original op is not erased until the rewrite is committed. 
+class ReplaceOperationRewrite : public OperationRewrite { +public: + ReplaceOperationRewrite(ConversionPatternRewriterImpl &rewriterImpl, + Operation *op, const TypeConverter *converter, + bool changedResults) + : OperationRewrite(Kind::ReplaceOperation, rewriterImpl, op), + converter(converter), changedResults(changedResults) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::ReplaceOperation; + } + + void commit() override; + + void rollback() override; + + void cleanup() override; + +private: + friend struct OperationConverter; + + /// An optional type converter that can be used to materialize conversions + /// between the new and old values if necessary. + const TypeConverter *converter; + + /// A boolean flag that indicates whether result types have changed or not. + bool changedResults; +}; } // namespace /// Return "true" if there is an operation rewrite that matches the specified @@ -890,6 +932,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { void eraseBlock(Block *block) override { if (erased.contains(block)) return; + assert(block->empty() && "expected empty block"); block->dropAllDefinedValueUses(); RewriterBase::eraseBlock(block); } @@ -921,12 +964,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// conversion. SmallVector unresolvedMaterializations; - /// Ordered map of requested operation replacements. - llvm::MapVector replacements; - - /// Ordered vector of any requested block argument replacements. - SmallVector argReplacements; - /// Ordered list of block operations (creations, splits, motions). SmallVector> rewrites; @@ -941,11 +978,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// operation was ignored. SetVector ignoredOps; - /// A vector of indices into `replacements` of operations that were replaced - /// with values with different result types than the original operation, e.g. - /// 1->N conversion of some kind. 
- SmallVector operationsWithChangedResults; - /// The current type converter, or nullptr if no type converter is currently /// active. const TypeConverter *currentTypeConverter = nullptr; @@ -957,6 +989,12 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// This allows the user to collect the match failure message. function_ref notifyCallback; + /// A set of pre-existing operations. When mode == OpConversionMode::Analysis, + /// this is populated with ops found to be legalizable to the target. + /// When mode == OpConversionMode::Partial, this is populated with ops found + /// *not* to be legalizable to the target. + DenseSet *trackedOps = nullptr; + #ifndef NDEBUG /// A set of operations that have pending updates. This tracking isn't /// strictly necessary, and is thus only active during debug builds for extra @@ -1001,6 +1039,8 @@ void BlockTypeConversionRewrite::commit() { } } + assert(origBlock->empty() && "expected empty block"); + origBlock->dropAllDefinedValueUses(); delete origBlock; origBlock = nullptr; } @@ -1063,6 +1103,47 @@ LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( return success(); } +void ReplaceBlockArgRewrite::commit() { + Value repl = rewriterImpl.mapping.lookupOrNull(arg, arg.getType()); + if (!repl) + return; + + if (isa(repl)) { + arg.replaceAllUsesWith(repl); + return; + } + + // If the replacement value is an operation, we check to make sure that we + // don't replace uses that are within the parent operation of the + // replacement value. 
+ Operation *replOp = cast(repl).getOwner(); + Block *replBlock = replOp->getBlock(); + arg.replaceUsesWithIf(repl, [&](OpOperand &operand) { + Operation *user = operand.getOwner(); + return user->getBlock() != replBlock || replOp->isBeforeInBlock(user); + }); +} + +void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase(arg); } + +void ReplaceOperationRewrite::commit() { + for (OpResult result : op->getResults()) + if (Value newValue = + rewriterImpl.mapping.lookupOrNull(result, result.getType())) + result.replaceAllUsesWith(newValue); + if (rewriterImpl.trackedOps) + rewriterImpl.trackedOps->erase(op); + // Do not erase the operation yet. It may still be referenced in `mapping`. + op->getBlock()->getOperations().remove(op); +} + +void ReplaceOperationRewrite::rollback() { + for (auto result : op->getResults()) + rewriterImpl.mapping.erase(result); +} + +void ReplaceOperationRewrite::cleanup() { eraseOp(op); } + void ConversionPatternRewriterImpl::detachNestedAndErase(Operation *op) { for (Region ®ion : op->getRegions()) { for (Block &block : region.getBlocks()) { @@ -1085,51 +1166,16 @@ void ConversionPatternRewriterImpl::discardRewrites() { } void ConversionPatternRewriterImpl::applyRewrites() { - // Apply all of the rewrites replacements requested during conversion. - for (auto &repl : replacements) { - for (OpResult result : repl.first->getResults()) - if (Value newValue = mapping.lookupOrNull(result, result.getType())) - result.replaceAllUsesWith(newValue); - } - - // Apply all of the requested argument replacements. - for (BlockArgument arg : argReplacements) { - Value repl = mapping.lookupOrNull(arg, arg.getType()); - if (!repl) - continue; - - if (isa(repl)) { - arg.replaceAllUsesWith(repl); - continue; - } - - // If the replacement value is an operation, we check to make sure that we - // don't replace uses that are within the parent operation of the - // replacement value. 
- Operation *replOp = cast(repl).getOwner(); - Block *replBlock = replOp->getBlock(); - arg.replaceUsesWithIf(repl, [&](OpOperand &operand) { - Operation *user = operand.getOwner(); - return user->getBlock() != replBlock || replOp->isBeforeInBlock(user); - }); - } + // Commit all rewrites. + for (auto &rewrite : rewrites) + rewrite->commit(); + for (auto &rewrite : rewrites) + rewrite->cleanup(); // Drop all of the unresolved materialization operations created during // conversion. for (auto &mat : unresolvedMaterializations) eraseRewriter.eraseOp(mat.getOp()); - - // In a second pass, erase all of the replaced operations in reverse. This - // allows processing nested operations before their parent region is - // destroyed. Because we process in reverse order, producers may be deleted - // before their users (a pattern deleting a producer and then the consumer) - // so we first drop all uses explicitly. - for (auto &repl : llvm::reverse(replacements)) - eraseRewriter.eraseOp(repl.first); - - // Commit all rewrites. - for (auto &rewrite : rewrites) - rewrite->commit(); } //===----------------------------------------------------------------------===// @@ -1137,28 +1183,14 @@ void ConversionPatternRewriterImpl::applyRewrites() { RewriterState ConversionPatternRewriterImpl::getCurrentState() { return RewriterState(createdOps.size(), unresolvedMaterializations.size(), - replacements.size(), argReplacements.size(), rewrites.size(), ignoredOps.size(), eraseRewriter.erased.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { - // Reset any replaced arguments. - for (BlockArgument replacedArg : - llvm::drop_begin(argReplacements, state.numArgReplacements)) - mapping.erase(replacedArg); - argReplacements.resize(state.numArgReplacements); - // Undo any rewrites. undoRewrites(state.numRewrites); - // Reset any replaced operations and undo any saved mappings. 
- for (auto &repl : llvm::drop_begin(replacements, state.numReplacements)) - for (auto result : repl.first->getResults()) - mapping.erase(result); - while (replacements.size() != state.numReplacements) - replacements.pop_back(); - // Pop all of the newly inserted materializations. while (unresolvedMaterializations.size() != state.numUnresolvedMaterializations) { @@ -1183,11 +1215,6 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state) { while (ignoredOps.size() != state.numIgnoredOperations) ignoredOps.pop_back(); - // Reset operations with changed results. - while (!operationsWithChangedResults.empty() && - operationsWithChangedResults.back() >= state.numReplacements) - operationsWithChangedResults.pop_back(); - while (eraseRewriter.erased.size() != state.numErased) eraseRewriter.erased.pop_back(); } @@ -1256,7 +1283,8 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( bool ConversionPatternRewriterImpl::isOpIgnored(Operation *op) const { // Check to see if this operation was replaced or its parent ignored. 
- return replacements.count(op) || ignoredOps.count(op->getParentOp()); + return ignoredOps.count(op->getParentOp()) || + hasRewrite(rewrites, op); } void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) { @@ -1396,7 +1424,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( "invalid to provide a replacement value when the argument isn't " "dropped"); mapping.map(origArg, inputMap->replacementValue); - argReplacements.push_back(origArg); + appendRewrite(block, origArg); continue; } @@ -1430,7 +1458,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( } mapping.map(origArg, newArg); - argReplacements.push_back(origArg); + appendRewrite(block, origArg); argInfo[i] = ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg); } @@ -1462,7 +1490,12 @@ void ConversionPatternRewriterImpl::notifyOperationInserted( void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, ValueRange newValues) { assert(newValues.size() == op->getNumResults()); - assert(!replacements.count(op) && "operation was already replaced"); +#ifndef NDEBUG + for (auto &rewrite : rewrites) + if (auto *opReplacement = dyn_cast(rewrite.get())) + assert(opReplacement->getOperation() != op && + "operation was already replaced"); +#endif // NDEBUG // Track if any of the results changed, e.g. erased and replaced with null. bool resultChanged = false; @@ -1477,11 +1510,9 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, mapping.map(result, newValue); resultChanged |= (newValue.getType() != result.getType()); } - if (resultChanged) - operationsWithChangedResults.push_back(replacements.size()); - // Record the requested operation replacement. - replacements.insert(std::make_pair(op, OpReplacement(currentTypeConverter))); + appendRewrite(op, currentTypeConverter, + resultChanged); // Mark this operation as recursively ignored so that we don't need to // convert any nested operations. 
@@ -1576,8 +1607,6 @@ void ConversionPatternRewriter::eraseOp(Operation *op) { } void ConversionPatternRewriter::eraseBlock(Block *block) { - impl->notifyBlockIsBeingErased(block); - // Mark all ops for erasure. for (Operation &op : *block) eraseOp(&op); @@ -1586,6 +1615,7 @@ void ConversionPatternRewriter::eraseBlock(Block *block) { // object and will be actually destroyed when rewrites are applied. This // allows us to keep the operations in the block live and undo the removal by // re-inserting the block. + impl->notifyBlockIsBeingErased(block); block->getParent()->getBlocks().remove(block); } @@ -1615,7 +1645,7 @@ void ConversionPatternRewriter::replaceUsesOfBlockArgument(BlockArgument from, << "'(in region of '" << parentOp->getName() << "'(" << from.getOwner()->getParentOp() << ")\n"; }); - impl->argReplacements.push_back(from); + impl->appendRewrite(from.getOwner(), from); impl->mapping.map(impl->mapping.lookupOrDefault(from), to); } @@ -2039,16 +2069,13 @@ OperationLegalizer::legalizePatternResult(Operation *op, const Pattern &pattern, #ifndef NDEBUG assert(impl.pendingRootUpdates.empty() && "dangling root updates"); - // Check that the root was either replaced or updated in place. + auto newRewrites = llvm::drop_begin(impl.rewrites, curState.numRewrites); auto replacedRoot = [&] { - return llvm::any_of( - llvm::drop_begin(impl.replacements, curState.numReplacements), - [op](auto &it) { return it.first == op; }); + return hasRewrite(newRewrites, op); }; auto updatedRootInPlace = [&] { - return hasRewrite( - llvm::drop_begin(impl.rewrites, curState.numRewrites), op); + return hasRewrite(newRewrites, op); }; assert((replacedRoot() || updatedRootInPlace()) && "expected pattern to replace the root operation"); @@ -2081,7 +2108,8 @@ LogicalResult OperationLegalizer::legalizePatternBlockRewrites( if (!rewrite) continue; Block *block = rewrite->getBlock(); - if (isa(rewrite)) + if (isa(rewrite)) continue; // Only check blocks outside of the current operation. 
Operation *parentOp = block->getParentOp(); @@ -2476,6 +2504,7 @@ LogicalResult OperationConverter::convertOperations( ConversionPatternRewriter rewriter(ops.front()->getContext()); ConversionPatternRewriterImpl &rewriterImpl = rewriter.getImpl(); rewriterImpl.notifyCallback = notifyCallback; + rewriterImpl.trackedOps = trackedOps; for (auto *op : toConvert) if (failed(convert(rewriter, op))) @@ -2493,13 +2522,6 @@ LogicalResult OperationConverter::convertOperations( rewriterImpl.discardRewrites(); } else { rewriterImpl.applyRewrites(); - - // It is possible for a later pattern to erase an op that was originally - // identified as illegal and added to the trackedOps, remove it now after - // replacements have been computed. - if (trackedOps) - for (auto &repl : rewriterImpl.replacements) - trackedOps->erase(repl.first); } return success(); } @@ -2513,21 +2535,20 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { failed(legalizeConvertedArgumentTypes(rewriter, rewriterImpl))) return failure(); - if (rewriterImpl.operationsWithChangedResults.empty()) - return success(); - // Process requested operation replacements. - for (unsigned i = 0, e = rewriterImpl.operationsWithChangedResults.size(); - i != e; ++i) { - unsigned replIdx = rewriterImpl.operationsWithChangedResults[i]; - auto &repl = *(rewriterImpl.replacements.begin() + replIdx); - for (OpResult result : repl.first->getResults()) { + for (unsigned i = 0; i < rewriterImpl.rewrites.size(); ++i) { + auto *opReplacement = + dyn_cast(rewriterImpl.rewrites[i].get()); + if (!opReplacement || !opReplacement->changedResults) + continue; + Operation *op = opReplacement->getOperation(); + for (OpResult result : op->getResults()) { Value newValue = rewriterImpl.mapping.lookupOrNull(result); // If the operation result was replaced with null, all of the uses of this // value should be replaced. 
if (!newValue) { - if (failed(legalizeErasedResult(repl.first, result, rewriterImpl))) + if (failed(legalizeErasedResult(op, result, rewriterImpl))) return failure(); continue; } @@ -2541,15 +2562,11 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { inverseMapping = rewriterImpl.mapping.getInverse(); // Legalize this result. - rewriter.setInsertionPoint(repl.first); - if (failed(legalizeChangedResultType(repl.first, result, newValue, - repl.second.converter, rewriter, + rewriter.setInsertionPoint(op); + if (failed(legalizeChangedResultType(op, result, newValue, + opReplacement->converter, rewriter, rewriterImpl, *inverseMapping))) return failure(); - - // Update the end iterator for this loop in the case it was updated - // when legalizing generated conversion operations. - e = rewriterImpl.operationsWithChangedResults.size(); } } return success(); From b014944e47ba6e2031e968268b15fba43a9e1dbf Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 23 Feb 2024 16:54:11 +0800 Subject: [PATCH 317/351] [NFC] [doc] Mentioning to include the guard headers from imported modules --- clang/docs/StandardCPlusPlusModules.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index 0347ff077fdb8..c5478bba45f38 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -868,6 +868,9 @@ headers to: ... #endif +If the modules imported by your library provides such headers too, remember to add them to +your ``your_library_imported.h`` too. + Importing modules ~~~~~~~~~~~~~~~~~ From ace83da316fbd2196fa35e8fd90218dcf84a020c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 23 Feb 2024 09:09:45 +0100 Subject: [PATCH 318/351] [clang][Interp][NFC] Improve Program dump()ing Add colors as well as more details for global variables. 
--- clang/lib/AST/Interp/Descriptor.h | 3 ++ clang/lib/AST/Interp/Disasm.cpp | 71 ++++++++++++++++++++++++++++--- clang/lib/AST/Interp/Program.h | 1 + 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h index ac8707a521e19..0f64d678f3ef6 100644 --- a/clang/lib/AST/Interp/Descriptor.h +++ b/clang/lib/AST/Interp/Descriptor.h @@ -213,6 +213,9 @@ struct Descriptor final { bool isRecord() const { return !IsArray && ElemRecord; } /// Checks if this is a dummy descriptor. bool isDummy() const { return IsDummy; } + + void dump() const; + void dump(llvm::raw_ostream &OS) const; }; /// Bitfield tracking the initialisation status of elements of primitive arrays. diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index eba437e05f59d..3bc9312debeb7 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -16,6 +16,7 @@ #include "Opcode.h" #include "PrimType.h" #include "Program.h" +#include "clang/AST/ASTDumperUtils.h" #include "clang/AST/DeclCXX.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Format.h" @@ -55,7 +56,10 @@ inline IntegralAP ReadArg>(Program &P, CodePtr &OpPC) { LLVM_DUMP_METHOD void Function::dump() const { dump(llvm::errs()); } LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const { - OS << getName() << " " << (const void *)this << "\n"; + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_GREEN, true}); + OS << getName() << " " << (const void *)this << "\n"; + } OS << "frame size: " << getFrameSize() << "\n"; OS << "arg size: " << getArgSize() << "\n"; OS << "rvo: " << hasRVO() << "\n"; @@ -83,14 +87,67 @@ LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const { LLVM_DUMP_METHOD void Program::dump() const { dump(llvm::errs()); } LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const { - OS << ":: Program\n"; - OS << "Global Variables: " << Globals.size() << "\n"; - OS << 
"Functions: " << Funcs.size() << "\n"; - OS << "\n"; - for (auto &Func : Funcs) { + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true}); + OS << "\n:: Program\n"; + } + + { + ColorScope SC(OS, true, {llvm::raw_ostream::WHITE, true}); + OS << "Total memory : " << Allocator.getTotalMemory() << " bytes\n"; + OS << "Global Variables: " << Globals.size() << "\n"; + } + unsigned GI = 0; + for (const Global *G : Globals) { + const Descriptor *Desc = G->block()->getDescriptor(); + OS << GI << ": " << (void *)G->block() << " "; + Desc->dump(OS); + OS << "\n"; + ++GI; + } + + { + ColorScope SC(OS, true, {llvm::raw_ostream::WHITE, true}); + OS << "Functions: " << Funcs.size() << "\n"; + } + for (const auto &Func : Funcs) { Func.second->dump(); } - for (auto &Anon : AnonFuncs) { + for (const auto &Anon : AnonFuncs) { Anon->dump(); } } + +LLVM_DUMP_METHOD void Descriptor::dump() const { + dump(llvm::errs()); + llvm::errs() << '\n'; +} + +LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const { + // Source + { + ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true}); + if (const auto *ND = dyn_cast_if_present(asDecl())) + OS << ND->getName(); + else if (asExpr()) + OS << "expr (TODO)"; + } + + // Print a few interesting bits about the descriptor. + if (isPrimitiveArray()) + OS << " primitive-array"; + else if (isCompositeArray()) + OS << " composite-array"; + else if (isRecord()) + OS << " record"; + else if (isPrimitive()) + OS << " primitive"; + + if (isZeroSizeArray()) + OS << " zero-size-arrary"; + else if (isUnknownSizeArray()) + OS << " unknown-size-array"; + + if (isDummy()) + OS << " dummy"; +} diff --git a/clang/lib/AST/Interp/Program.h b/clang/lib/AST/Interp/Program.h index 364a63dbf477a..7922eafbeb2d0 100644 --- a/clang/lib/AST/Interp/Program.h +++ b/clang/lib/AST/Interp/Program.h @@ -190,6 +190,7 @@ class Program final { std::byte *data() { return B.data(); } /// Return a pointer to the block. 
Block *block() { return &B; } + const Block *block() const { return &B; } private: /// Required metadata - does not actually track pointers. From 9ca70d72f4f217ff4f6ab337ad4a8e6666860791 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 10:03:26 +0100 Subject: [PATCH 319/351] [mlir][Transforms][NFC] Turn op creation into `IRRewrite` (#81759) This commit is a refactoring of the dialect conversion. The dialect conversion maintains a list of "IR rewrites" that can be committed (upon success) or rolled back (upon failure). Until now, the dialect conversion kept track of "op creation" in separate internal data structures. This commit turns "op creation" into an `IRRewrite` that can be committed and rolled back just like any other rewrite. This commit simplifies the internal state of the dialect conversion. --- .../Transforms/Utils/DialectConversion.cpp | 102 +++++++++++------- 1 file changed, 64 insertions(+), 38 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index dec68048dc1d3..704597148dfac 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -152,17 +152,12 @@ namespace { /// This class contains a snapshot of the current conversion rewriter state. /// This is useful when saving and undoing a set of rewrites. 
struct RewriterState { - RewriterState(unsigned numCreatedOps, unsigned numUnresolvedMaterializations, - unsigned numRewrites, unsigned numIgnoredOperations, - unsigned numErased) - : numCreatedOps(numCreatedOps), - numUnresolvedMaterializations(numUnresolvedMaterializations), + RewriterState(unsigned numUnresolvedMaterializations, unsigned numRewrites, + unsigned numIgnoredOperations, unsigned numErased) + : numUnresolvedMaterializations(numUnresolvedMaterializations), numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), numErased(numErased) {} - /// The current number of created operations. - unsigned numCreatedOps; - /// The current number of unresolved materializations. unsigned numUnresolvedMaterializations; @@ -303,7 +298,8 @@ class IRRewrite { // Operation rewrites MoveOperation, ModifyOperation, - ReplaceOperation + ReplaceOperation, + CreateOperation }; virtual ~IRRewrite() = default; @@ -376,7 +372,10 @@ class CreateBlockRewrite : public BlockRewrite { auto &blockOps = block->getOperations(); while (!blockOps.empty()) blockOps.remove(blockOps.begin()); - eraseBlock(block); + if (block->getParent()) + eraseBlock(block); + else + delete block; } }; @@ -606,7 +605,7 @@ class OperationRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::MoveOperation && - rewrite->getKind() <= Kind::ReplaceOperation; + rewrite->getKind() <= Kind::CreateOperation; } protected: @@ -740,6 +739,19 @@ class ReplaceOperationRewrite : public OperationRewrite { /// A boolean flag that indicates whether result types have changed or not. 
bool changedResults; }; + +class CreateOperationRewrite : public OperationRewrite { +public: + CreateOperationRewrite(ConversionPatternRewriterImpl &rewriterImpl, + Operation *op) + : OperationRewrite(Kind::CreateOperation, rewriterImpl, op) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::CreateOperation; + } + + void rollback() override; +}; } // namespace /// Return "true" if there is an operation rewrite that matches the specified @@ -957,9 +969,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { // replacing a value with one of a different type. ConversionValueMapping mapping; - /// Ordered vector of all of the newly created operations during conversion. - SmallVector createdOps; - /// Ordered vector of all unresolved type conversion materializations during /// conversion. SmallVector unresolvedMaterializations; @@ -1144,6 +1153,15 @@ void ReplaceOperationRewrite::rollback() { void ReplaceOperationRewrite::cleanup() { eraseOp(op); } +void CreateOperationRewrite::rollback() { + for (Region ®ion : op->getRegions()) { + while (!region.getBlocks().empty()) + region.getBlocks().remove(region.getBlocks().begin()); + } + op->dropAllUses(); + eraseOp(op); +} + void ConversionPatternRewriterImpl::detachNestedAndErase(Operation *op) { for (Region ®ion : op->getRegions()) { for (Block &block : region.getBlocks()) { @@ -1161,8 +1179,6 @@ void ConversionPatternRewriterImpl::discardRewrites() { // Remove any newly created ops. 
for (UnresolvedMaterialization &materialization : unresolvedMaterializations) detachNestedAndErase(materialization.getOp()); - for (auto *op : llvm::reverse(createdOps)) - detachNestedAndErase(op); } void ConversionPatternRewriterImpl::applyRewrites() { @@ -1182,9 +1198,8 @@ void ConversionPatternRewriterImpl::applyRewrites() { // State Management RewriterState ConversionPatternRewriterImpl::getCurrentState() { - return RewriterState(createdOps.size(), unresolvedMaterializations.size(), - rewrites.size(), ignoredOps.size(), - eraseRewriter.erased.size()); + return RewriterState(unresolvedMaterializations.size(), rewrites.size(), + ignoredOps.size(), eraseRewriter.erased.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { @@ -1205,12 +1220,6 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state) { detachNestedAndErase(op); } - // Pop all of the newly created operations. - while (createdOps.size() != state.numCreatedOps) { - detachNestedAndErase(createdOps.back()); - createdOps.pop_back(); - } - // Pop all of the recorded ignored operations that are no longer valid. while (ignoredOps.size() != state.numIgnoredOperations) ignoredOps.pop_back(); @@ -1478,7 +1487,7 @@ void ConversionPatternRewriterImpl::notifyOperationInserted( }); if (!previous.isSet()) { // This is a newly created op. - createdOps.push_back(op); + appendRewrite(op); return; } Operation *prevOp = previous.getPoint() == previous.getBlock()->end() @@ -1979,13 +1988,16 @@ OperationLegalizer::legalizeWithFold(Operation *op, rewriter.replaceOp(op, replacementValues); // Recursively legalize any new constant operations. 
- for (unsigned i = curState.numCreatedOps, e = rewriterImpl.createdOps.size(); + for (unsigned i = curState.numRewrites, e = rewriterImpl.rewrites.size(); i != e; ++i) { - Operation *cstOp = rewriterImpl.createdOps[i]; - if (failed(legalize(cstOp, rewriter))) { + auto *createOp = + dyn_cast(rewriterImpl.rewrites[i].get()); + if (!createOp) + continue; + if (failed(legalize(createOp->getOperation(), rewriter))) { LLVM_DEBUG(logFailure(rewriterImpl.logger, "failed to legalize generated constant '{0}'", - cstOp->getName())); + createOp->getOperation()->getName())); rewriterImpl.resetState(curState); return failure(); } @@ -2132,9 +2144,14 @@ LogicalResult OperationLegalizer::legalizePatternBlockRewrites( // blocks in regions created by this pattern will already be legalized later // on. If we haven't built the set yet, build it now. if (operationsToIgnore.empty()) { - auto createdOps = ArrayRef(impl.createdOps) - .drop_front(state.numCreatedOps); - operationsToIgnore.insert(createdOps.begin(), createdOps.end()); + for (unsigned i = state.numRewrites, e = impl.rewrites.size(); i != e; + ++i) { + auto *createOp = + dyn_cast(impl.rewrites[i].get()); + if (!createOp) + continue; + operationsToIgnore.insert(createOp->getOperation()); + } } // If this operation should be considered for re-legalization, try it. 
@@ -2152,8 +2169,11 @@ LogicalResult OperationLegalizer::legalizePatternBlockRewrites( LogicalResult OperationLegalizer::legalizePatternCreatedOperations( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &impl, RewriterState &state, RewriterState &newState) { - for (int i = state.numCreatedOps, e = newState.numCreatedOps; i != e; ++i) { - Operation *op = impl.createdOps[i]; + for (int i = state.numRewrites, e = newState.numRewrites; i != e; ++i) { + auto *createOp = dyn_cast(impl.rewrites[i].get()); + if (!createOp) + continue; + Operation *op = createOp->getOperation(); if (failed(legalize(op, rewriter))) { LLVM_DEBUG(logFailure(impl.logger, "failed to legalize generated operation '{0}'({1})", @@ -2583,10 +2603,16 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( }); return liveUserIt == val.user_end() ? nullptr : *liveUserIt; }; - for (auto &r : rewriterImpl.rewrites) - if (auto *rewrite = dyn_cast(r.get())) - if (failed(rewrite->materializeLiveConversions(findLiveUser))) + // Note: `rewrites` may be reallocated as the loop is running. + for (int64_t i = 0; i < static_cast(rewriterImpl.rewrites.size()); + ++i) { + auto &rewrite = rewriterImpl.rewrites[i]; + if (auto *blockTypeConversionRewrite = + dyn_cast(rewrite.get())) + if (failed(blockTypeConversionRewrite->materializeLiveConversions( + findLiveUser))) return failure(); + } return success(); } From 59ff4d131c7d6b3bfcbe8e96cac99c9d8a65bf4e Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 10:15:12 +0100 Subject: [PATCH 320/351] [mlir][Transforms][NFC] Turn unresolved materializations into `IRRewrite`s (#81761) This commit is a refactoring of the dialect conversion. The dialect conversion maintains a list of "IR rewrites" that can be committed (upon success) or rolled back (upon failure). This commit turns the creation of unresolved materializations (`unrealized_conversion_cast`) into `IRRewrite` objects. 
After this commit, all steps in `applyRewrites` and `discardRewrites` are calls to `IRRewrite::commit` and `IRRewrite::rollback`. --- .../Transforms/Utils/DialectConversion.cpp | 369 +++++++++--------- 1 file changed, 176 insertions(+), 193 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 704597148dfac..635a2cb00f388 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -152,15 +152,11 @@ namespace { /// This class contains a snapshot of the current conversion rewriter state. /// This is useful when saving and undoing a set of rewrites. struct RewriterState { - RewriterState(unsigned numUnresolvedMaterializations, unsigned numRewrites, - unsigned numIgnoredOperations, unsigned numErased) - : numUnresolvedMaterializations(numUnresolvedMaterializations), - numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), + RewriterState(unsigned numRewrites, unsigned numIgnoredOperations, + unsigned numErased) + : numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), numErased(numErased) {} - /// The current number of unresolved materializations. - unsigned numUnresolvedMaterializations; - /// The current number of rewrites performed. unsigned numRewrites; @@ -171,109 +167,10 @@ struct RewriterState { unsigned numErased; }; -//===----------------------------------------------------------------------===// -// UnresolvedMaterialization - -/// This class represents an unresolved materialization, i.e. a materialization -/// that was inserted during conversion that needs to be legalized at the end of -/// the conversion process. -class UnresolvedMaterialization { -public: - /// The type of materialization. - enum Kind { - /// This materialization materializes a conversion for an illegal block - /// argument type, to a legal one. 
- Argument, - - /// This materialization materializes a conversion from an illegal type to a - /// legal one. - Target - }; - - UnresolvedMaterialization(UnrealizedConversionCastOp op = nullptr, - const TypeConverter *converter = nullptr, - Kind kind = Target, Type origOutputType = nullptr) - : op(op), converterAndKind(converter, kind), - origOutputType(origOutputType) {} - - /// Return the temporary conversion operation inserted for this - /// materialization. - UnrealizedConversionCastOp getOp() const { return op; } - - /// Return the type converter of this materialization (which may be null). - const TypeConverter *getConverter() const { - return converterAndKind.getPointer(); - } - - /// Return the kind of this materialization. - Kind getKind() const { return converterAndKind.getInt(); } - - /// Set the kind of this materialization. - void setKind(Kind kind) { converterAndKind.setInt(kind); } - - /// Return the original illegal output type of the input values. - Type getOrigOutputType() const { return origOutputType; } - -private: - /// The unresolved materialization operation created during conversion. - UnrealizedConversionCastOp op; - - /// The corresponding type converter to use when resolving this - /// materialization, and the kind of this materialization. - llvm::PointerIntPair converterAndKind; - - /// The original output type. This is only used for argument conversions. - Type origOutputType; -}; -} // namespace - -/// Build an unresolved materialization operation given an output type and set -/// of input operands. -static Value buildUnresolvedMaterialization( - UnresolvedMaterialization::Kind kind, Block *insertBlock, - Block::iterator insertPt, Location loc, ValueRange inputs, Type outputType, - Type origOutputType, const TypeConverter *converter, - SmallVectorImpl &unresolvedMaterializations) { - // Avoid materializing an unnecessary cast. 
- if (inputs.size() == 1 && inputs.front().getType() == outputType) - return inputs.front(); - - // Create an unresolved materialization. We use a new OpBuilder to avoid - // tracking the materialization like we do for other operations. - OpBuilder builder(insertBlock, insertPt); - auto convertOp = - builder.create(loc, outputType, inputs); - unresolvedMaterializations.emplace_back(convertOp, converter, kind, - origOutputType); - return convertOp.getResult(0); -} -static Value buildUnresolvedArgumentMaterialization( - PatternRewriter &rewriter, Location loc, ValueRange inputs, - Type origOutputType, Type outputType, const TypeConverter *converter, - SmallVectorImpl &unresolvedMaterializations) { - return buildUnresolvedMaterialization( - UnresolvedMaterialization::Argument, rewriter.getInsertionBlock(), - rewriter.getInsertionPoint(), loc, inputs, outputType, origOutputType, - converter, unresolvedMaterializations); -} -static Value buildUnresolvedTargetMaterialization( - Location loc, Value input, Type outputType, const TypeConverter *converter, - SmallVectorImpl &unresolvedMaterializations) { - Block *insertBlock = input.getParentBlock(); - Block::iterator insertPt = insertBlock->begin(); - if (OpResult inputRes = dyn_cast(input)) - insertPt = ++inputRes.getOwner()->getIterator(); - - return buildUnresolvedMaterialization( - UnresolvedMaterialization::Target, insertBlock, insertPt, loc, input, - outputType, outputType, converter, unresolvedMaterializations); -} - //===----------------------------------------------------------------------===// // IR rewrites //===----------------------------------------------------------------------===// -namespace { /// An IR rewrite that can be committed (upon success) or rolled back (upon /// failure). 
/// @@ -299,7 +196,8 @@ class IRRewrite { MoveOperation, ModifyOperation, ReplaceOperation, - CreateOperation + CreateOperation, + UnresolvedMaterialization }; virtual ~IRRewrite() = default; @@ -605,7 +503,7 @@ class OperationRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::MoveOperation && - rewrite->getKind() <= Kind::CreateOperation; + rewrite->getKind() <= Kind::UnresolvedMaterialization; } protected: @@ -752,6 +650,70 @@ class CreateOperationRewrite : public OperationRewrite { void rollback() override; }; + +/// The type of materialization. +enum MaterializationKind { + /// This materialization materializes a conversion for an illegal block + /// argument type, to a legal one. + Argument, + + /// This materialization materializes a conversion from an illegal type to a + /// legal one. + Target +}; + +/// An unresolved materialization, i.e., a "builtin.unrealized_conversion_cast" +/// op. Unresolved materializations are erased at the end of the dialect +/// conversion. +class UnresolvedMaterializationRewrite : public OperationRewrite { +public: + UnresolvedMaterializationRewrite( + ConversionPatternRewriterImpl &rewriterImpl, + UnrealizedConversionCastOp op, const TypeConverter *converter = nullptr, + MaterializationKind kind = MaterializationKind::Target, + Type origOutputType = nullptr) + : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op), + converterAndKind(converter, kind), origOutputType(origOutputType) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::UnresolvedMaterialization; + } + + UnrealizedConversionCastOp getOperation() const { + return cast(op); + } + + void rollback() override; + + void cleanup() override; + + /// Return the type converter of this materialization (which may be null). + const TypeConverter *getConverter() const { + return converterAndKind.getPointer(); + } + + /// Return the kind of this materialization. 
+ MaterializationKind getMaterializationKind() const { + return converterAndKind.getInt(); + } + + /// Set the kind of this materialization. + void setMaterializationKind(MaterializationKind kind) { + converterAndKind.setInt(kind); + } + + /// Return the original illegal output type of the input values. + Type getOrigOutputType() const { return origOutputType; } + +private: + /// The corresponding type converter to use when resolving this + /// materialization, and the kind of this materialization. + llvm::PointerIntPair + converterAndKind; + + /// The original output type. This is only used for argument conversions. + Type origOutputType; +}; } // namespace /// Return "true" if there is an operation rewrite that matches the specified @@ -794,14 +756,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { : rewriter(rewriter), eraseRewriter(rewriter.getContext()), notifyCallback(nullptr) {} - /// Cleanup and destroy any generated rewrite operations. This method is - /// invoked when the conversion process fails. - void discardRewrites(); - - /// Apply all requested operation rewrites. This method is invoked when the - /// conversion process succeeds. - void applyRewrites(); - //===--------------------------------------------------------------------===// // State Management //===--------------------------------------------------------------------===// @@ -809,6 +763,10 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Return the current state of the rewriter. RewriterState getCurrentState(); + /// Apply all requested operation rewrites. This method is invoked when the + /// conversion process succeeds. + void applyRewrites(); + /// Reset the state of the rewriter to a previously saved point. void resetState(RewriterState state); @@ -841,17 +799,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// removes them from being considered for legalization. 
void markNestedOpsIgnored(Operation *op); - /// Detach any operations nested in the given operation from their parent - /// blocks, and erase the given operation. This can be used when the nested - /// operations are scheduled for erasure themselves, so deleting the regions - /// of the given operation together with their content would result in - /// double-free. This happens, for example, when rolling back op creation in - /// the reverse order and if the nested ops were created before the parent op. - /// This function does not need to collect nested ops recursively because it - /// is expected to also be called for each nested op when it is about to be - /// deleted. - void detachNestedAndErase(Operation *op); - //===--------------------------------------------------------------------===// // Type Conversion //===--------------------------------------------------------------------===// @@ -890,6 +837,28 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion &signatureConversion); + //===--------------------------------------------------------------------===// + // Materializations + //===--------------------------------------------------------------------===// + /// Build an unresolved materialization operation given an output type and set + /// of input operands. 
+ Value buildUnresolvedMaterialization(MaterializationKind kind, + Block *insertBlock, + Block::iterator insertPt, Location loc, + ValueRange inputs, Type outputType, + Type origOutputType, + const TypeConverter *converter); + + Value buildUnresolvedArgumentMaterialization(PatternRewriter &rewriter, + Location loc, ValueRange inputs, + Type origOutputType, + Type outputType, + const TypeConverter *converter); + + Value buildUnresolvedTargetMaterialization(Location loc, Value input, + Type outputType, + const TypeConverter *converter); + //===--------------------------------------------------------------------===// // Rewriter Notification Hooks //===--------------------------------------------------------------------===// @@ -969,10 +938,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { // replacing a value with one of a different type. ConversionValueMapping mapping; - /// Ordered vector of all unresolved type conversion materializations during - /// conversion. - SmallVector unresolvedMaterializations; - /// Ordered list of block operations (creations, splits, motions). SmallVector> rewrites; @@ -1162,24 +1127,15 @@ void CreateOperationRewrite::rollback() { eraseOp(op); } -void ConversionPatternRewriterImpl::detachNestedAndErase(Operation *op) { - for (Region ®ion : op->getRegions()) { - for (Block &block : region.getBlocks()) { - while (!block.getOperations().empty()) - block.getOperations().remove(block.getOperations().begin()); - block.dropAllDefinedValueUses(); - } +void UnresolvedMaterializationRewrite::rollback() { + if (getMaterializationKind() == MaterializationKind::Target) { + for (Value input : op->getOperands()) + rewriterImpl.mapping.erase(input); } - eraseRewriter.eraseOp(op); + eraseOp(op); } -void ConversionPatternRewriterImpl::discardRewrites() { - undoRewrites(); - - // Remove any newly created ops. 
- for (UnresolvedMaterialization &materialization : unresolvedMaterializations) - detachNestedAndErase(materialization.getOp()); -} +void UnresolvedMaterializationRewrite::cleanup() { eraseOp(op); } void ConversionPatternRewriterImpl::applyRewrites() { // Commit all rewrites. @@ -1187,39 +1143,20 @@ void ConversionPatternRewriterImpl::applyRewrites() { rewrite->commit(); for (auto &rewrite : rewrites) rewrite->cleanup(); - - // Drop all of the unresolved materialization operations created during - // conversion. - for (auto &mat : unresolvedMaterializations) - eraseRewriter.eraseOp(mat.getOp()); } //===----------------------------------------------------------------------===// // State Management RewriterState ConversionPatternRewriterImpl::getCurrentState() { - return RewriterState(unresolvedMaterializations.size(), rewrites.size(), - ignoredOps.size(), eraseRewriter.erased.size()); + return RewriterState(rewrites.size(), ignoredOps.size(), + eraseRewriter.erased.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { // Undo any rewrites. undoRewrites(state.numRewrites); - // Pop all of the newly inserted materializations. - while (unresolvedMaterializations.size() != - state.numUnresolvedMaterializations) { - UnresolvedMaterialization mat = unresolvedMaterializations.pop_back_val(); - UnrealizedConversionCastOp op = mat.getOp(); - - // If this was a target materialization, drop the mapping that was inserted. - if (mat.getKind() == UnresolvedMaterialization::Target) { - for (Value input : op->getOperands()) - mapping.erase(input); - } - detachNestedAndErase(op); - } - // Pop all of the recorded ignored operations that are no longer valid. while (ignoredOps.size() != state.numIgnoredOperations) ignoredOps.pop_back(); @@ -1280,8 +1217,7 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( if (currentTypeConverter && desiredType && newOperandType != desiredType) { Location operandLoc = inputLoc ? 
*inputLoc : operand.getLoc(); Value castValue = buildUnresolvedTargetMaterialization( - operandLoc, newOperand, desiredType, currentTypeConverter, - unresolvedMaterializations); + operandLoc, newOperand, desiredType, currentTypeConverter); mapping.map(mapping.lookupOrDefault(newOperand), castValue); newOperand = castValue; } @@ -1463,7 +1399,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( newArg = buildUnresolvedArgumentMaterialization( rewriter, origArg.getLoc(), replArgs, origOutputType, outputType, - converter, unresolvedMaterializations); + converter); } mapping.map(origArg, newArg); @@ -1476,6 +1412,50 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( return newBlock; } +//===----------------------------------------------------------------------===// +// Materializations +//===----------------------------------------------------------------------===// + +/// Build an unresolved materialization operation given an output type and set +/// of input operands. +Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization( + MaterializationKind kind, Block *insertBlock, Block::iterator insertPt, + Location loc, ValueRange inputs, Type outputType, Type origOutputType, + const TypeConverter *converter) { + // Avoid materializing an unnecessary cast. + if (inputs.size() == 1 && inputs.front().getType() == outputType) + return inputs.front(); + + // Create an unresolved materialization. We use a new OpBuilder to avoid + // tracking the materialization like we do for other operations. 
+ OpBuilder builder(insertBlock, insertPt); + auto convertOp = + builder.create(loc, outputType, inputs); + appendRewrite(convertOp, converter, kind, + origOutputType); + return convertOp.getResult(0); +} +Value ConversionPatternRewriterImpl::buildUnresolvedArgumentMaterialization( + PatternRewriter &rewriter, Location loc, ValueRange inputs, + Type origOutputType, Type outputType, const TypeConverter *converter) { + return buildUnresolvedMaterialization( + MaterializationKind::Argument, rewriter.getInsertionBlock(), + rewriter.getInsertionPoint(), loc, inputs, outputType, origOutputType, + converter); +} +Value ConversionPatternRewriterImpl::buildUnresolvedTargetMaterialization( + Location loc, Value input, Type outputType, + const TypeConverter *converter) { + Block *insertBlock = input.getParentBlock(); + Block::iterator insertPt = insertBlock->begin(); + if (OpResult inputRes = dyn_cast(input)) + insertPt = ++inputRes.getOwner()->getIterator(); + + return buildUnresolvedMaterialization(MaterializationKind::Target, + insertBlock, insertPt, loc, input, + outputType, outputType, converter); +} + //===----------------------------------------------------------------------===// // Rewriter Notification Hooks @@ -2528,18 +2508,18 @@ LogicalResult OperationConverter::convertOperations( for (auto *op : toConvert) if (failed(convert(rewriter, op))) - return rewriterImpl.discardRewrites(), failure(); + return rewriterImpl.undoRewrites(), failure(); // Now that all of the operations have been converted, finalize the conversion // process to ensure any lingering conversion artifacts are cleaned up and // legalized. if (failed(finalize(rewriter))) - return rewriterImpl.discardRewrites(), failure(); + return rewriterImpl.undoRewrites(), failure(); // After a successful conversion, apply rewrites if this is not an analysis // conversion. 
if (mode == OpConversionMode::Analysis) { - rewriterImpl.discardRewrites(); + rewriterImpl.undoRewrites(); } else { rewriterImpl.applyRewrites(); } @@ -2645,11 +2625,12 @@ replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl, /// Compute all of the unresolved materializations that will persist beyond the /// conversion process, and require inserting a proper user materialization for. static void computeNecessaryMaterializations( - DenseMap &materializationOps, + DenseMap + &materializationOps, ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping, - SetVector &necessaryMaterializations) { + SetVector &necessaryMaterializations) { auto isLive = [&](Value value) { auto findFn = [&](Operation *user) { auto matIt = materializationOps.find(user); @@ -2684,14 +2665,17 @@ static void computeNecessaryMaterializations( return Value(); }; - SetVector worklist; - for (auto &mat : rewriterImpl.unresolvedMaterializations) { - materializationOps.try_emplace(mat.getOp(), &mat); - worklist.insert(&mat); + SetVector worklist; + for (auto &rewrite : rewriterImpl.rewrites) { + auto *mat = dyn_cast(rewrite.get()); + if (!mat) + continue; + materializationOps.try_emplace(mat->getOperation(), mat); + worklist.insert(mat); } while (!worklist.empty()) { - UnresolvedMaterialization *mat = worklist.pop_back_val(); - UnrealizedConversionCastOp op = mat->getOp(); + UnresolvedMaterializationRewrite *mat = worklist.pop_back_val(); + UnrealizedConversionCastOp op = mat->getOperation(); // We currently only handle target materializations here. 
assert(op->getNumResults() == 1 && "unexpected materialization type"); @@ -2733,7 +2717,7 @@ static void computeNecessaryMaterializations( auto isBlockArg = [](Value v) { return isa(v); }; if (llvm::any_of(op->getOperands(), isBlockArg) || llvm::any_of(inverseMapping[op->getResult(0)], isBlockArg)) { - mat->setKind(UnresolvedMaterialization::Argument); + mat->setMaterializationKind(MaterializationKind::Argument); } // If the materialization does not have any live users, we don't need to @@ -2743,7 +2727,7 @@ static void computeNecessaryMaterializations( // value replacement even if the types differ in some cases. When those // patterns are fixed, we can drop the argument special case here. bool isMaterializationLive = isLive(opResult); - if (mat->getKind() == UnresolvedMaterialization::Argument) + if (mat->getMaterializationKind() == MaterializationKind::Argument) isMaterializationLive |= llvm::any_of(inverseMapping[opResult], isLive); if (!isMaterializationLive) continue; @@ -2763,8 +2747,9 @@ static void computeNecessaryMaterializations( /// Legalize the given unresolved materialization. Returns success if the /// materialization was legalized, failure otherise. 
static LogicalResult legalizeUnresolvedMaterialization( - UnresolvedMaterialization &mat, - DenseMap &materializationOps, + UnresolvedMaterializationRewrite &mat, + DenseMap + &materializationOps, ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping) { @@ -2784,7 +2769,7 @@ static LogicalResult legalizeUnresolvedMaterialization( return Value(); }; - UnrealizedConversionCastOp op = mat.getOp(); + UnrealizedConversionCastOp op = mat.getOperation(); if (!rewriterImpl.ignoredOps.insert(op)) return success(); @@ -2834,8 +2819,8 @@ static LogicalResult legalizeUnresolvedMaterialization( rewriter.setInsertionPoint(op); Value newMaterialization; - switch (mat.getKind()) { - case UnresolvedMaterialization::Argument: + switch (mat.getMaterializationKind()) { + case MaterializationKind::Argument: // Try to materialize an argument conversion. // FIXME: The current argument materialization hook expects the original // output type, even though it doesn't use that as the actual output type @@ -2852,7 +2837,7 @@ static LogicalResult legalizeUnresolvedMaterialization( // If an argument materialization failed, fallback to trying a target // materialization. [[fallthrough]]; - case UnresolvedMaterialization::Target: + case MaterializationKind::Target: newMaterialization = converter->materializeTargetConversion( rewriter, op->getLoc(), outputType, inputOperands); break; @@ -2880,14 +2865,12 @@ LogicalResult OperationConverter::legalizeUnresolvedMaterializations( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, std::optional>> &inverseMapping) { - if (rewriterImpl.unresolvedMaterializations.empty()) - return success(); inverseMapping = rewriterImpl.mapping.getInverse(); // As an initial step, compute all of the inserted materializations that we // expect to persist beyond the conversion process. 
- DenseMap materializationOps; - SetVector necessaryMaterializations; + DenseMap materializationOps; + SetVector necessaryMaterializations; computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, *inverseMapping, necessaryMaterializations); From b13c8e5099ec7886fcd198b1f6aec14f928c963c Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Fri, 23 Feb 2024 10:20:54 +0100 Subject: [PATCH 321/351] Revert "[llvm][AArch64] Autoupgrade function attributes from Module attributes. (#80640)" This reverts commit 531e8c26b3f2626e7f1a997e0e8b61d67d10aded. --- llvm/include/llvm/IR/AutoUpgrade.h | 3 +- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/IR/AutoUpgrade.cpp | 72 +------------------ llvm/lib/Linker/IRMover.cpp | 4 -- .../test/Bitcode/upgrade-arc-runtime-calls.ll | 4 +- .../AArch64/link-branch-target-enforcement.ll | 1 - .../LTO/AArch64/link-sign-return-address.ll | 43 ----------- llvm/test/Linker/link-arm-and-thumb.ll | 7 +- 8 files changed, 8 insertions(+), 128 deletions(-) delete mode 100644 llvm/test/LTO/AArch64/link-sign-return-address.ll diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h index c0d96efc54752..152f781ffa9b3 100644 --- a/llvm/include/llvm/IR/AutoUpgrade.h +++ b/llvm/include/llvm/IR/AutoUpgrade.h @@ -67,8 +67,7 @@ namespace llvm { void UpgradeSectionAttributes(Module &M); /// Correct any IR that is relying on old function attribute behavior. - void UpgradeFunctionAttributes(Function &F, - bool ModuleMetadataIsMaterialized = false); + void UpgradeFunctionAttributes(Function &F); /// If the given TBAA tag uses the scalar TBAA format, create a new node /// corresponding to the upgrade to the struct-path aware TBAA format. 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 8c860101afa02..832907a3f53f5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -6706,7 +6706,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) { } // Look for functions that rely on old function attribute behavior. - UpgradeFunctionAttributes(*F, true); + UpgradeFunctionAttributes(*F); // Bring in any functions that this function forward-referenced via // blockaddresses. diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index edff13c796b31..b90bbe71ac189 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5155,46 +5155,7 @@ struct StrictFPUpgradeVisitor : public InstVisitor { }; } // namespace -// Check if the module attribute is present and not zero. -static bool isModuleAttributeSet(const Module *M, const StringRef &ModAttr) { - const auto *Attr = - mdconst::extract_or_null(M->getModuleFlag(ModAttr)); - return Attr && Attr->getZExtValue(); -} - -// Copy an attribute from module to the function if exists. -// First value of the pair is used when the module attribute is not zero -// the second otherwise. -static void -CopyModuleAttributeToFunction(Function &F, StringRef FnAttrName, - StringRef ModAttrName, - std::pair Values) { - if (F.hasFnAttribute(FnAttrName)) - return; - F.addFnAttr(FnAttrName, isModuleAttributeSet(F.getParent(), ModAttrName) - ? Values.first - : Values.second); -} - -// Copy a boolean attribute from module to the function if exists. -// Module attribute treated false if zero otherwise true. -static void CopyModuleAttributeToFunction(Function &F, StringRef AttrName) { - CopyModuleAttributeToFunction( - F, AttrName, AttrName, - std::make_pair("true", "false")); -} - -// Copy an attribute from module to the function if exists. -// First value of the pair is used when the module attribute is not zero -// the second otherwise. 
-static void -CopyModuleAttributeToFunction(Function &F, StringRef AttrName, - std::pair Values) { - CopyModuleAttributeToFunction(F, AttrName, AttrName, Values); -} - -void llvm::UpgradeFunctionAttributes(Function &F, - bool ModuleMetadataIsMaterialized) { +void llvm::UpgradeFunctionAttributes(Function &F) { // If a function definition doesn't have the strictfp attribute, // convert any callsite strictfp attributes to nobuiltin. if (!F.isDeclaration() && !F.hasFnAttribute(Attribute::StrictFP)) { @@ -5206,37 +5167,6 @@ void llvm::UpgradeFunctionAttributes(Function &F, F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType())); for (auto &Arg : F.args()) Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType())); - - if (!ModuleMetadataIsMaterialized) - return; - if (F.isDeclaration()) - return; - Module *M = F.getParent(); - if (!M) - return; - - Triple T(M->getTargetTriple()); - // Convert module level attributes to function level attributes because - // after merging modules the attributes might change and would have different - // effect on the functions as the original module would have. 
- if (T.isThumb() || T.isARM() || T.isAArch64()) { - if (!F.hasFnAttribute("sign-return-address")) { - StringRef SignType = "none"; - if (isModuleAttributeSet(M, "sign-return-address")) - SignType = "non-leaf"; - - if (isModuleAttributeSet(M, "sign-return-address-all")) - SignType = "all"; - - F.addFnAttr("sign-return-address", SignType); - } - CopyModuleAttributeToFunction(F, "branch-target-enforcement"); - CopyModuleAttributeToFunction(F, "branch-protection-pauth-lr"); - CopyModuleAttributeToFunction(F, "guarded-control-stack"); - CopyModuleAttributeToFunction( - F, "sign-return-address-key", - std::make_pair("b_key", "a_key")); - } } static bool isOldLoopArgument(Metadata *MD) { diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 9f45ebc6eda01..37d21119447b9 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1606,10 +1606,6 @@ Error IRLinker::run() { // Loop over all of the linked values to compute type mappings. computeTypeMapping(); - // Update function attributes before copying them to destation module. - for (Function &F : SrcM->getFunctionList()) - UpgradeFunctionAttributes(F, true); - std::reverse(Worklist.begin(), Worklist.end()); while (!Worklist.empty()) { GlobalValue *GV = Worklist.back(); diff --git a/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll b/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll index d2edec18d55e5..19f25f98953fa 100644 --- a/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll +++ b/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll @@ -55,7 +55,7 @@ unwindBlock: // Check that auto-upgrader converts function calls to intrinsic calls. Note that // the auto-upgrader doesn't touch invoke instructions. 
-// ARC: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) #0 personality +// ARC: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) personality // ARC: %[[V0:.*]] = tail call ptr @llvm.objc.autorelease(ptr %[[A]]) // ARC-NEXT: tail call void @llvm.objc.autoreleasePoolPop(ptr %[[A]]) // ARC-NEXT: %[[V1:.*]] = tail call ptr @llvm.objc.autoreleasePoolPush() @@ -88,7 +88,7 @@ unwindBlock: // ARC-NEXT: tail call void @llvm.objc.arc.annotation.bottomup.bbend(ptr %[[B]], ptr %[[C]]) // ARC-NEXT: invoke void @objc_autoreleasePoolPop(ptr %[[A]]) -// NOUPGRADE: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) #0 personality +// NOUPGRADE: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) personality // NOUPGRADE: %[[V0:.*]] = tail call ptr @objc_autorelease(ptr %[[A]]) // NOUPGRADE-NEXT: tail call void @objc_autoreleasePoolPop(ptr %[[A]]) // NOUPGRADE-NEXT: %[[V1:.*]] = tail call ptr @objc_autoreleasePoolPush() diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll index 74d9c86881d52..ccf8cf67ede6d 100644 --- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll +++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll @@ -32,7 +32,6 @@ entry: ; CHECK-DUMP:
: ; CHECK-DUMP: bl 0x8 ; CHECK-DUMP: : -; CHECK-DUMP: paciasp ; `main` doesn't support BTI while `foo` does, so in the binary ; we should see only PAC which is supported by both. diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll deleted file mode 100644 index c25857ceed7b4..0000000000000 --- a/llvm/test/LTO/AArch64/link-sign-return-address.ll +++ /dev/null @@ -1,43 +0,0 @@ -; Testcase to check that module with different branch-target-enforcement can -; be mixed. -; -; RUN: llvm-as %s -o %t1.bc -; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc -; RUN: llvm-lto -exported-symbol main \ -; RUN: -exported-symbol foo \ -; RUN: -filetype=obj \ -; RUN: %t2.bc %t1.bc \ -; RUN: -o %t1.exe 2>&1 -; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s -; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-gnu" - -declare i32 @foo(); - -define i32 @main() { -entry: - %add = call i32 @foo() - ret i32 %add -} - -!llvm.module.flags = !{!0, !1, !2, !3 } -!0 = !{i32 8, !"branch-target-enforcement", i32 0} -!1 = !{i32 8, !"sign-return-address", i32 0} -!2 = !{i32 8, !"sign-return-address-all", i32 0} -!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} - -; CHECK-DUMP: : -; CHECK-DUMP: paciasp -; CHECK-DUMP: mov w0, #0x2a -; CHECK-DUMP: autiasp -; CHECK-DUMP: ret -; CHECK-DUMP:
: -; CHECK-DUMP-NOT: paciasp -; CHECK-DUMP: str x30, -; CHECK-DUMP: bl 0x14 - -; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary -; we should not see anything. -; CHECK-PROP-NOT: Properties: aarch64 feature: PAC \ No newline at end of file diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll index 37bd8c37f8b5e..a90f2128e4430 100644 --- a/llvm/test/Linker/link-arm-and-thumb.ll +++ b/llvm/test/Linker/link-arm-and-thumb.ll @@ -13,12 +13,11 @@ entry: ret i32 %add } -; CHECK: define i32 @main() [[MAIN_ATTRS:#[0-9]+]] +; CHECK: define i32 @main() { ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]] ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]] -; CHECK: attributes [[MAIN_ATTRS]] = { {{.*}} } -; CHECK: attributes [[ARM_ATTRS]] = { {{.*}} "target-features"="-thumb-mode" } -; CHECK: attributes [[THUMB_ATTRS]] = { {{.*}} "target-features"="+thumb-mode" } +; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" } +; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" } ; STDERR-NOT: warning: Linking two modules of different target triples: From 2ae8bee8f11f8d5cc26cf6b4bb71001706ca0104 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Fri, 23 Feb 2024 10:28:58 +0100 Subject: [PATCH 322/351] [ARM][GlobalISel] Remove legacy legalizer rules (#82619) I've been looking at LegacyLegalizerInfo and what its place in GISel is. It seems like it's very close to being deleted so I'm checking if we can remove the last remaining uses of it. Looks like we can do a drop-in replacement with the new legalizer for ARM. 
--- llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 56 ++++-------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index c5199aab75272..00a29f8ecb232 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -25,42 +25,6 @@ using namespace llvm; using namespace LegalizeActions; -/// FIXME: The following static functions are SizeChangeStrategy functions -/// that are meant to temporarily mimic the behaviour of the old legalization -/// based on doubling/halving non-legal types as closely as possible. This is -/// not entirly possible as only legalizing the types that are exactly a power -/// of 2 times the size of the legal types would require specifying all those -/// sizes explicitly. -/// In practice, not specifying those isn't a problem, and the below functions -/// should disappear quickly as we add support for legalizing non-power-of-2 -/// sized types further. 
-static void addAndInterleaveWithUnsupported( - LegacyLegalizerInfo::SizeAndActionsVec &result, - const LegacyLegalizerInfo::SizeAndActionsVec &v) { - for (unsigned i = 0; i < v.size(); ++i) { - result.push_back(v[i]); - if (i + 1 < v[i].first && i + 1 < v.size() && - v[i + 1].first != v[i].first + 1) - result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported}); - } -} - -static LegacyLegalizerInfo::SizeAndActionsVec -widen_8_16(const LegacyLegalizerInfo::SizeAndActionsVec &v) { - assert(v.size() >= 1); - assert(v[0].first > 17); - LegacyLegalizerInfo::SizeAndActionsVec result = { - {1, LegacyLegalizeActions::Unsupported}, - {8, LegacyLegalizeActions::WidenScalar}, - {9, LegacyLegalizeActions::Unsupported}, - {16, LegacyLegalizeActions::WidenScalar}, - {17, LegacyLegalizeActions::Unsupported}}; - addAndInterleaveWithUnsupported(result, v); - auto Largest = result.back().first; - result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported}); - return result; -} - static bool AEABI(const ARMSubtarget &ST) { return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI(); } @@ -118,15 +82,14 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { .libcallFor({s32}) .clampScalar(0, s32, s32); - for (unsigned Op : {G_SREM, G_UREM}) { - LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16); - if (HasHWDivide) - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Lower); - else if (AEABI(ST)) - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Custom); - else - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Libcall); - } + auto &REMBuilder = + getActionDefinitionsBuilder({G_SREM, G_UREM}).minScalar(0, s32); + if (HasHWDivide) + REMBuilder.lowerFor({s32}); + else if (AEABI(ST)) + REMBuilder.customFor({s32}); + else + REMBuilder.libcallFor({s32}); getActionDefinitionsBuilder(G_INTTOPTR) .legalFor({{p0, s32}}) @@ -202,8 +165,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { 
LoadStoreBuilder.maxScalar(0, s32); - for (auto Ty : {s32, s64}) - LegacyInfo.setAction({G_FNEG, Ty}, LegacyLegalizeActions::Lower); + getActionDefinitionsBuilder(G_FNEG).lowerFor({s32, s64}); getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64}); From bbdc62e7180168effd0c480979bdaf933d0615d1 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Fri, 23 Feb 2024 09:29:45 +0000 Subject: [PATCH 323/351] [AArch64][CostModel] Improve scalar frem cost (#80423) In AArch64 the cost of scalar frem is the cost of a call to 'fmod'. --- .../AArch64/AArch64TargetTransformInfo.cpp | 7 ++ .../CostModel/AArch64/arith-fp-frem.ll | 68 +++++++++---------- .../Analysis/CostModel/AArch64/arith-fp.ll | 22 +++--- 3 files changed, 52 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 6655931181c2d..010e569809e27 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2972,6 +2972,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info); + case ISD::FREM: + // Pass nullptr as fmod/fmodf calls are emitted by the backend even when + // those functions are not declared in the module. 
+ if (!Ty->isVectorTy()) + return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, + Op2Info); } } diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll index 20e0ef7ea3428..63149adfa2158 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll @@ -22,44 +22,44 @@ target triple = "aarch64-unknown-linux-gnu" define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; NEON-NO-VECLIB-LABEL: 'frem_f64' -; NEON-NO-VECLIB: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; NEON-NO-VECLIB: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; NEON-NO-VECLIB: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; NEON-NO-VECLIB: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in ; ; SVE-NO-VECLIB-LABEL: 'frem_f64' -; SVE-NO-VECLIB: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-NO-VECLIB: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in ; ; NEON-ARMPL-LABEL: 'frem_f64' -; NEON-ARMPL: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; NEON-ARMPL: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; 
NEON-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; NEON-ARMPL: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; ; NEON-SLEEF-LABEL: 'frem_f64' -; NEON-SLEEF: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; NEON-SLEEF: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; NEON-SLEEF: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; NEON-SLEEF: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; ; SVE-ARMPL-LABEL: 'frem_f64' -; SVE-ARMPL: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-ARMPL: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-ARMPL: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; SVE-ARMPL: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-ARMPL: LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in ; ; SVE-SLEEF-LABEL: 'frem_f64' -; SVE-SLEEF: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-SLEEF: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-SLEEF: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-SLEEF: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; SVE-SLEEF: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-SLEEF: LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in ; ; 
SVE-ARMPL-TAILFOLD-LABEL: 'frem_f64' -; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in ; ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f64' -; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in ; @@ -83,55 +83,55 @@ define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; NEON-NO-VECLIB-LABEL: 'frem_f32' -; NEON-NO-VECLIB: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; NEON-NO-VECLIB: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in -; NEON-NO-VECLIB: LV: Found an estimated cost of 20 for VF 4 For instruction: %res = frem float %in, %in +; NEON-NO-VECLIB: LV: Found an estimated 
cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; NEON-NO-VECLIB: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in +; NEON-NO-VECLIB: LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in ; ; SVE-NO-VECLIB-LABEL: 'frem_f32' -; SVE-NO-VECLIB: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-NO-VECLIB: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in -; SVE-NO-VECLIB: LV: Found an estimated cost of 20 for VF 4 For instruction: %res = frem float %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in ; ; NEON-ARMPL-LABEL: 'frem_f32' -; NEON-ARMPL: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; NEON-ARMPL: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; NEON-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; NEON-ARMPL: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; NEON-ARMPL: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; ; NEON-SLEEF-LABEL: 'frem_f32' -; NEON-SLEEF: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; NEON-SLEEF: LV: Found an 
estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; NEON-SLEEF: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; NEON-SLEEF: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; NEON-SLEEF: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; ; SVE-ARMPL-LABEL: 'frem_f32' -; SVE-ARMPL: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-ARMPL: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; SVE-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-ARMPL: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; SVE-ARMPL: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; SVE-ARMPL: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-ARMPL: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in ; SVE-ARMPL: LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in ; ; SVE-SLEEF-LABEL: 'frem_f32' -; SVE-SLEEF: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-SLEEF: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; SVE-SLEEF: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-SLEEF: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; SVE-SLEEF: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; SVE-SLEEF: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-SLEEF: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem 
float %in, %in ; SVE-SLEEF: LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in ; ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f32' -; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in ; ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f32' -; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll 
b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll index c352892354fc2..497ade4f2f613 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll @@ -197,17 +197,17 @@ define i32 @fdiv(i32 %arg) { define i32 @frem(i32 %arg) { ; CHECK-LABEL: 'frem' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F16 = frem <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F16 = frem <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F16 = frem <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = frem <2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = frem <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = frem <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = frem <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F16 = frem half undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4F16 = frem <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8F16 = frem <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16F16 = frem <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %F32 = frem float undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F32 = frem <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4F32 = frem <4 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8F32 = frem <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = frem double undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F64 = frem <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4F64 = frem <4 x double> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F16 = frem half undef, undef From 335d34d9eae8c943e2164373c7eab1e450eaf435 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Fri, 23 Feb 2024 10:30:19 +0100 Subject: [PATCH 324/351] [MLIR][LLVM] Fix debug intrinsic import (#82637) This revision handles the case that the translation of a scope fails due to cyclic metadata. This mainly affects the import of debug intrinsics that indirectly take such a scope as metadata argument (e.g. via local variable or label metadata). This commit ensures we drop intrinsics with such a dependency on cyclic metadata. 
--- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 6 ++- mlir/lib/Target/LLVMIR/DebugImporter.cpp | 35 +++++++++++----- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 9 ++++- .../Target/LLVMIR/Import/import-failure.ll | 40 +++++++++++++++++-- 4 files changed, 73 insertions(+), 17 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index feb3578fe2d49..b88f1186a44b4 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -513,7 +513,11 @@ def LLVM_DbgLabelOp : LLVM_IntrOp<"dbg.label", [], [], [], 0> { }); }]; let mlirBuilder = [{ - $_op = $_builder.create<$_qualCppClassName>($_location, $_label_attr($label)); + DILabelAttr labelAttr = $_label_attr($label); + // Drop the intrinsic if the label translation fails due to cylic metadata. + if (!labelAttr) + return success(); + $_op = $_builder.create<$_qualCppClassName>($_location, labelAttr); }]; let assemblyFormat = "$label attr-dict"; } diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp index 6521295230091..c631617f97354 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp +++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp @@ -99,21 +99,31 @@ DIFileAttr DebugImporter::translateImpl(llvm::DIFile *node) { } DILabelAttr DebugImporter::translateImpl(llvm::DILabel *node) { - return DILabelAttr::get(context, translate(node->getScope()), + // Return nullptr if the scope or type is a cyclic dependency. 
+ DIScopeAttr scope = translate(node->getScope()); + if (node->getScope() && !scope) + return nullptr; + return DILabelAttr::get(context, scope, getStringAttrOrNull(node->getRawName()), translate(node->getFile()), node->getLine()); } DILexicalBlockAttr DebugImporter::translateImpl(llvm::DILexicalBlock *node) { - return DILexicalBlockAttr::get(context, translate(node->getScope()), - translate(node->getFile()), node->getLine(), - node->getColumn()); + // Return nullptr if the scope or type is a cyclic dependency. + DIScopeAttr scope = translate(node->getScope()); + if (node->getScope() && !scope) + return nullptr; + return DILexicalBlockAttr::get(context, scope, translate(node->getFile()), + node->getLine(), node->getColumn()); } DILexicalBlockFileAttr DebugImporter::translateImpl(llvm::DILexicalBlockFile *node) { - return DILexicalBlockFileAttr::get(context, translate(node->getScope()), - translate(node->getFile()), + // Return nullptr if the scope or type is a cyclic dependency. + DIScopeAttr scope = translate(node->getScope()); + if (node->getScope() && !scope) + return nullptr; + return DILexicalBlockFileAttr::get(context, scope, translate(node->getFile()), node->getDiscriminator()); } @@ -135,11 +145,14 @@ DebugImporter::translateImpl(llvm::DIGlobalVariable *node) { } DILocalVariableAttr DebugImporter::translateImpl(llvm::DILocalVariable *node) { - return DILocalVariableAttr::get(context, translate(node->getScope()), - getStringAttrOrNull(node->getRawName()), - translate(node->getFile()), node->getLine(), - node->getArg(), node->getAlignInBits(), - translate(node->getType())); + // Return nullptr if the scope or type is a cyclic dependency. 
+ DIScopeAttr scope = translate(node->getScope()); + if (node->getScope() && !scope) + return nullptr; + return DILocalVariableAttr::get( + context, scope, getStringAttrOrNull(node->getRawName()), + translate(node->getFile()), node->getLine(), node->getArg(), + node->getAlignInBits(), translate(node->getType())); } DIScopeAttr DebugImporter::translateImpl(llvm::DIScope *node) { diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 97ccb2b29f3ae..d63ea12ecd49b 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1966,6 +1966,13 @@ ModuleImport::processDebugIntrinsic(llvm::DbgVariableIntrinsic *dbgIntr, // TODO: find a way to support this case. if (isMetadataKillLocation(dbgIntr)) return emitUnsupportedWarning(); + // Drop debug intrinsics if the associated variable information cannot be + // translated due to cyclic debug metadata. + // TODO: Support cyclic debug metadata. + DILocalVariableAttr localVariableAttr = + matchLocalVariableAttr(dbgIntr->getArgOperand(1)); + if (!localVariableAttr) + return emitUnsupportedWarning(); FailureOr argOperand = convertMetadataValue(dbgIntr->getArgOperand(0)); if (failed(argOperand)) return emitError(loc) << "failed to convert a debug intrinsic operand: " @@ -1991,8 +1998,6 @@ ModuleImport::processDebugIntrinsic(llvm::DbgVariableIntrinsic *dbgIntr, } else { builder.setInsertionPointAfterValue(*argOperand); } - DILocalVariableAttr localVariableAttr = - matchLocalVariableAttr(dbgIntr->getArgOperand(1)); auto locationExprAttr = debugImporter->translateExpression(dbgIntr->getExpression()); Operation *op = diff --git a/mlir/test/Target/LLVMIR/Import/import-failure.ll b/mlir/test/Target/LLVMIR/Import/import-failure.ll index 0962134665663..9a4e939d10651 100644 --- a/mlir/test/Target/LLVMIR/Import/import-failure.ll +++ b/mlir/test/Target/LLVMIR/Import/import-failure.ll @@ -59,13 +59,15 @@ define void @unhandled_intrinsic() gc "example" { ; // 
----- +; Check that debug intrinsics with an unsupported argument are dropped. + declare void @llvm.dbg.value(metadata, metadata, metadata) ; CHECK: import-failure.ll -; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata !DIArgList(i64 %arg1, i64 undef), metadata !3, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_constu, 1, DW_OP_mul, DW_OP_plus, DW_OP_stack_value)), !dbg !5 +; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata !DIArgList(i64 %{{.*}}, i64 undef), metadata !3, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_constu, 1, DW_OP_mul, DW_OP_plus, DW_OP_stack_value)) ; CHECK: import-failure.ll -; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata !6, metadata !3, metadata !DIExpression()), !dbg !5 -define void @dropped_instruction(i64 %arg1) { +; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata !6, metadata !3, metadata !DIExpression()) +define void @unsupported_argument(i64 %arg1) { call void @llvm.dbg.value(metadata !DIArgList(i64 %arg1, i64 undef), metadata !3, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_constu, 1, DW_OP_mul, DW_OP_plus, DW_OP_stack_value)), !dbg !5 call void @llvm.dbg.value(metadata !6, metadata !3, metadata !DIExpression()), !dbg !5 ret void @@ -83,6 +85,38 @@ define void @dropped_instruction(i64 %arg1) { ; // ----- +; Check that debug intrinsics that depend on cyclic metadata are dropped. 
+ +declare void @llvm.dbg.value(metadata, metadata, metadata) + +; CHECK: import-failure.ll +; CHECK-SAME: warning: dropped instruction: call void @llvm.dbg.label(metadata !{{.*}}) +; CHECK: import-failure.ll +; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata i64 %{{.*}}, metadata !3, metadata !DIExpression()) +define void @cylic_metadata(i64 %arg1) { + call void @llvm.dbg.value(metadata i64 %arg1, metadata !10, metadata !DIExpression()), !dbg !14 + call void @llvm.dbg.label(metadata !13), !dbg !14 + ret void +} + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!0} +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2) +!2 = !DIFile(filename: "import-failure.ll", directory: "/") +!3 = !DICompositeType(tag: DW_TAG_array_type, size: 42, baseType: !4) +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3) +!5 = distinct !DISubprogram(name: "class_method", scope: !2, file: !2, type: !6, spFlags: DISPFlagDefinition, unit: !1) +!6 = !DISubroutineType(types: !7) +!7 = !{!3} +!10 = !DILocalVariable(scope: !5, name: "arg1", file: !2, line: 1, arg: 1, align: 64); +!11 = !DILexicalBlock(scope: !5) +!12 = !DILexicalBlockFile(scope: !11, discriminator: 0) +!13 = !DILabel(scope: !12, name: "label", file: !2, line: 42) +!14 = !DILocation(line: 1, column: 2, scope: !5) + +; // ----- + ; global_dtors with non-null data fields cannot be represented in MLIR. ; CHECK: ; CHECK-SAME: error: unhandled global variable: @llvm.global_dtors From a622b21f4607ee787c6fe63032a849c24374882b Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 10:31:55 +0100 Subject: [PATCH 325/351] [mlir][Transforms] Make `ConversionPatternRewriter` constructor private (#82244) `ConversionPatternRewriter` objects should not be constructed outside of dialect conversions. 
Some IR modifications performed through a `ConversionPatternRewriter` are reflected in the IR in a delayed fashion (e.g., only when the dialect conversion is guaranteed to succeed). Using a `ConversionPatternRewriter` outside of the dialect conversion is incorrect API usage and can bring the IR in an inconsistent state. Migration guide: Use `IRRewriter` instead of `ConversionPatternRewriter`. --- flang/lib/Frontend/FrontendActions.cpp | 2 +- .../mlir/Transforms/DialectConversion.h | 10 +++++++++- .../lib/Transforms/Utils/DialectConversion.cpp | 18 +++++++++++------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 44e80e946ed83..849b3c8e4dc02 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -177,7 +177,7 @@ static void addAMDGPUSpecificMLIRItems(mlir::ModuleOp &mlirModule, return; } - mlir::ConversionPatternRewriter builder(mlirModule.getContext()); + mlir::IRRewriter builder(mlirModule.getContext()); unsigned oclcABIVERsion = codeGenOpts.CodeObjectVersion; auto int32Type = builder.getI32Type(); diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 2575be4cdea1a..5c91a9498b35d 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -27,6 +27,7 @@ class Block; class ConversionPatternRewriter; class MLIRContext; class Operation; +struct OperationConverter; class Type; class Value; @@ -657,7 +658,6 @@ struct ConversionPatternRewriterImpl; /// hooks. class ConversionPatternRewriter final : public PatternRewriter { public: - explicit ConversionPatternRewriter(MLIRContext *ctx); ~ConversionPatternRewriter() override; /// Apply a signature conversion to the entry block of the given region. 
This @@ -764,6 +764,14 @@ class ConversionPatternRewriter final : public PatternRewriter { detail::ConversionPatternRewriterImpl &getImpl(); private: + // Allow OperationConverter to construct new rewriters. + friend struct OperationConverter; + + /// Conversion pattern rewriters must not be used outside of dialect + /// conversions. They apply some IR rewrites in a delayed fashion and could + /// bring the IR into an inconsistent state when used standalone. + explicit ConversionPatternRewriter(MLIRContext *ctx); + // Hide unsupported pattern rewriter API. using OpBuilder::setListener; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 635a2cb00f388..2cdbfb78faf27 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -627,9 +627,11 @@ class ReplaceOperationRewrite : public OperationRewrite { void cleanup() override; -private: - friend struct OperationConverter; + const TypeConverter *getConverter() const { return converter; } + + bool hasChangedResults() const { return changedResults; } +private: /// An optional type converter that can be used to materialize conversions /// between the new and old values if necessary. const TypeConverter *converter; @@ -2387,7 +2389,9 @@ enum OpConversionMode { /// applied to the operations on success. Analysis, }; +} // namespace +namespace mlir { // This class converts operations to a given conversion target via a set of // rewrite patterns. The conversion behaves differently depending on the // conversion mode. @@ -2447,7 +2451,7 @@ struct OperationConverter { /// *not* to be legalizable to the target. 
DenseSet *trackedOps; }; -} // namespace +} // namespace mlir LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, Operation *op) { @@ -2539,7 +2543,7 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { for (unsigned i = 0; i < rewriterImpl.rewrites.size(); ++i) { auto *opReplacement = dyn_cast(rewriterImpl.rewrites[i].get()); - if (!opReplacement || !opReplacement->changedResults) + if (!opReplacement || !opReplacement->hasChangedResults()) continue; Operation *op = opReplacement->getOperation(); for (OpResult result : op->getResults()) { @@ -2563,9 +2567,9 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { // Legalize this result. rewriter.setInsertionPoint(op); - if (failed(legalizeChangedResultType(op, result, newValue, - opReplacement->converter, rewriter, - rewriterImpl, *inverseMapping))) + if (failed(legalizeChangedResultType( + op, result, newValue, opReplacement->getConverter(), rewriter, + rewriterImpl, *inverseMapping))) return failure(); } } From b39f5660a408b47307e57a0882eb8af85d72e283 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Fri, 23 Feb 2024 09:42:08 +0000 Subject: [PATCH 326/351] [mlir][ArmSME] Add test-lower-to-arm-sme pipeline (#81732) The ArmSME compilation pipeline has evolved significantly and is now sufficiently complex enough that it warrants a proper lowering pipeline that encapsulates the various passes and orderings. Currently the pipeline is loosely defined in our integration tests, but these have diverged and are not using the same passes or ordering everywhere. This patch introduces a test-lower-to-arm-sme pipeline mirroring test-lower-to-llvm that provides some sanity when running e2e examples and can be used a reference for targeting ArmSME in MLIR. All the integration tests are updated to use this pipeline. The intention is to productize the pipeline once it becomes more mature. 
--- .../Dialect/Linalg/CPU/ArmSME/fill-2d.mlir | 9 +- .../Linalg/CPU/ArmSME/matmul-transpose-a.mlir | 9 +- .../Dialect/Linalg/CPU/ArmSME/matmul.mlir | 8 +- .../Linalg/CPU/ArmSME/multi-tile-matmul.mlir | 6 +- .../Linalg/CPU/ArmSME/use-too-many-tiles.mlir | 7 +- .../CPU/ArmSME/load-store-128-bit-tile.mlir | 6 +- .../Vector/CPU/ArmSME/test-load-vertical.mlir | 6 +- .../CPU/ArmSME/test-multi-tile-transpose.mlir | 8 +- .../ArmSME/test-outerproduct-f16f16f32.mlir | 10 +- .../CPU/ArmSME/test-outerproduct-f32.mlir | 6 +- .../CPU/ArmSME/test-outerproduct-f64.mlir | 6 +- .../CPU/ArmSME/test-outerproduct-i8i8i32.mlir | 8 +- .../CPU/ArmSME/test-transfer-read-2d.mlir | 6 +- .../CPU/ArmSME/test-transfer-write-2d.mlir | 7 +- .../Vector/CPU/ArmSME/test-transpose.mlir | 6 +- .../Dialect/Vector/CPU/ArmSME/tile_fill.mlir | 6 +- .../Vector/CPU/ArmSME/vector-load-store.mlir | 6 +- .../Dialect/Vector/CPU/ArmSME/vector-ops.mlir | 5 +- mlir/test/lib/Dialect/ArmSME/CMakeLists.txt | 16 +++ .../lib/Dialect/ArmSME/TestLowerToArmSME.cpp | 99 +++++++++++++++++++ mlir/test/lib/Dialect/CMakeLists.txt | 1 + mlir/tools/mlir-opt/CMakeLists.txt | 1 + mlir/tools/mlir-opt/mlir-opt.cpp | 2 + 23 files changed, 141 insertions(+), 103 deletions(-) create mode 100644 mlir/test/lib/Dialect/ArmSME/CMakeLists.txt create mode 100644 mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir index 44ff1afe76d38..12f13e8dbc4a9 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir @@ -1,13 +1,8 @@ // RUN: mlir-opt %s \ -// RUN: -transform-interpreter \ -// RUN: -test-transform-dialect-erase-schedule \ +// RUN: -transform-interpreter -test-transform-dialect-erase-schedule \ // RUN: -lower-vector-mask \ // RUN: -one-shot-bufferize="bufferize-function-boundaries" \ -// RUN: 
-enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// RUN: -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -convert-arm-sme-to-llvm -cse -canonicalize \ -// RUN: -test-lower-to-llvm | \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=entry -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir index c781d5e0af846..34c5351c8703d 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir @@ -1,12 +1,7 @@ // RUN: mlir-opt %s \ // RUN: -transform-interpreter -test-transform-dialect-erase-schedule \ -// RUN: -one-shot-bufferize="bufferize-function-boundaries" -canonicalize \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ -// RUN: -convert-arm-sme-to-llvm \ -// RUN: -convert-vector-to-llvm=enable-arm-sve \ -// RUN: -cse -canonicalize -test-lower-to-llvm | \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir index 31c3202c3fc57..2bfdaa8e8a2be 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir @@ -1,12 +1,6 @@ // RUN: mlir-opt %s \ // RUN: 
-transform-interpreter -test-transform-dialect-erase-schedule \ -// RUN: -canonicalize \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ -// RUN: -convert-arm-sme-to-llvm \ -// RUN: -convert-vector-to-llvm=enable-arm-sve \ -// RUN: -cse -canonicalize -test-lower-to-llvm | \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir index d5c35068ccb32..e376bdde24a15 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir @@ -1,11 +1,7 @@ // RUN: mlir-opt %s \ // RUN: -transform-interpreter -test-transform-dialect-erase-schedule \ // RUN: -one-shot-bufferize="bufferize-function-boundaries" -canonicalize \ -// RUN: -arm-sme-vector-legalization -canonicalize -cse \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf=full-unroll -convert-arm-sme-to-llvm \ -// RUN: -test-lower-to-llvm | \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir index 42fe21cccd48a..ee3866de303e0 100644 --- 
a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir @@ -1,10 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// RUN: -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ -// RUN: -convert-arm-sme-to-llvm -convert-vector-to-llvm=enable-arm-sve -cse \ -// RUN: -canonicalize -test-lower-to-llvm -verify-diagnostics | \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm -verify-diagnostics | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir index 59b4a7e6a52f9..06b1c107cb2c1 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = test_load_store_zaq0 -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -allocate-arm-sme-tiles -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir index 064141c349241..27be801252b81 100644 
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir index 0827d9b7464ad..9d836d93c85bb 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir @@ -1,10 +1,4 @@ -// RUN: mlir-opt %s -arm-sme-vector-legalization -cse -canonicalize \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ -// RUN: -convert-arm-sme-to-llvm \ -// RUN: -convert-vector-to-llvm=enable-arm-sve \ -// RUN: -cse -canonicalize -test-lower-to-llvm | \ +// RUN: mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f16f16f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f16f16f32.mlir index f081838300a9a..a06ad37b054e4 
100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f16f16f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f16f16f32.mlir @@ -1,11 +1,7 @@ +// DEFINE: %{opts} = // DEFINE: %{entry} = main -// DEFINE: %{fusion_opts} = -arm-sme-outer-product-fusion // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme %{fusion_opts} \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm -o %t +// DEFINE: -test-lower-to-arm-sme=%{opts} -test-lower-to-llvm -o %t // DEFINE: %{run} = %mcr_aarch64_cmd %t \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry} -entry-point-result=void \ @@ -18,7 +14,7 @@ // Check result is the same when outerproducts are not combined into widening // variant. 
-// REDEFINE: %{fusion_opts} = +// REDEFINE: %{opts} = fuse-outer-products=false // RUN: %{run} | FileCheck %s func.func @main() { diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir index 5f41b37560e76..7e7869d1c957a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir @@ -1,10 +1,6 @@ // DEFINE: %{entry_point} = test_outerproduct_no_accumulator_4x4xf32 // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm -o %t +// DEFINE: -test-lower-to-arm-sme -test-lower-to-llvm -o %t // DEFINE: %{run} = %mcr_aarch64_cmd %t \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir index a1bb9b7d6f80e..46bf799232ae3 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir @@ -1,10 +1,6 @@ // DEFINE: %{entry_point} = test_outerproduct_no_accumulator_2x2xf64 // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm -o %t +// DEFINE: -test-lower-to-arm-sme -test-lower-to-llvm 
-o %t // DEFINE: %{run} = %mcr_aarch64_cmd %t \ // DEFINE: -march=aarch64 -mattr=+sve,+sme-f64f64 \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-i8i8i32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-i8i8i32.mlir index 1770e579f0bd6..9a353ec2d2f66 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-i8i8i32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-i8i8i32.mlir @@ -1,11 +1,5 @@ // DEFINE: %{entry} = main -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -arm-sme-outer-product-fusion \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir index 6e028d5fb8361..52f56883cad9c 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s 
-test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir index c0c1f55d7ddd1..710cc6672f005 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir @@ -1,10 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir index eee3c56351d81..88bc0d0709d48 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = 
%mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir index 223bc8ce74343..e14917486d845 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir @@ -1,8 +1,4 @@ -// RUN: mlir-opt %s -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// RUN: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// RUN: -convert-arm-sme-to-llvm -cse -canonicalize \ -// RUN: -test-lower-to-llvm | \ +// RUN: mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -march=aarch64 -mattr=+sve,+sme \ // RUN: -e entry -entry-point-result=i32 \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir index 2f151e2ec72fb..b29790db14ddc 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = za0_d_f64 -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=i32 \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir 
b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir index f28bf19b29993..c8c401bed1446 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir @@ -1,8 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=i32 \ diff --git a/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt b/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt new file mode 100644 index 0000000000000..de4971ff7eb3d --- /dev/null +++ b/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt @@ -0,0 +1,16 @@ +# Exclude tests from libMLIR.so +add_mlir_library(MLIRArmSMETestPasses + TestLowerToArmSME.cpp + + EXCLUDE_FROM_LIBMLIR + + LINK_LIBS PUBLIC + MLIRArithToArmSME + MLIRArmSMEToLLVM + MLIRArmSMEToSCF + MLIRIR + MLIRPass + MLIRTransforms + MLIRVectorToArmSME + MLIRVectorToSCF + ) diff --git a/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp b/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp new file mode 100644 index 0000000000000..48d4a5859f8a0 --- /dev/null +++ b/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp @@ -0,0 +1,99 @@ +//===- TestLowerToArmSME.cpp - Test lowering to ArmSME as a sink pass -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass for testing the lowering to ArmSME as a +// generally usable sink pass. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" +#include "mlir/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.h" +#include "mlir/Conversion/ArmSMEToSCF/ArmSMEToSCF.h" +#include "mlir/Conversion/VectorToArmSME/VectorToArmSME.h" +#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" +#include "mlir/Dialect/ArmSME/Transforms/Passes.h" +#include "mlir/Dialect/ArmSVE/Transforms/Passes.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { +struct TestLowerToArmSMEOptions + : public PassPipelineOptions { + PassOptions::Option fuseOuterProducts{ + *this, "fuse-outer-products", + llvm::cl::desc("Fuse outer product operations via " + "'-arm-sme-outer-product-fusion' pass"), + llvm::cl::init(true)}; +}; + +void buildTestLowerToArmSME(OpPassManager &pm, + const TestLowerToArmSMEOptions &options) { + // Legalize vector operations so they can be converted to ArmSME. + pm.addPass(arm_sme::createVectorLegalizationPass()); + + // Sprinkle some cleanups. + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + + // Passes that convert operations on vectors to ArmSME operations. + + // Convert Arith to ArmSME. + pm.addPass(createArithToArmSMEConversionPass()); + // Convert Vector to ArmSME. + pm.addPass(createConvertVectorToArmSMEPass()); + + // Fuse outer products. + if (options.fuseOuterProducts) + pm.addPass(arm_sme::createOuterProductFusionPass()); + + // Convert operations on high-level vectors to loops. + + // Convert ArmSME to SCF. 
+ pm.addPass(createConvertArmSMEToSCFPass()); + + // Convert Vector to SCF (with full unroll enabled). + pm.addPass(createConvertVectorToSCFPass( + VectorTransferToSCFOptions().enableFullUnroll())); + + // Allocate tiles for ArmSME operations. + // + // Later passes may create further ArmSME ops that implement the + // ArmSMETileOpInterface, but tiles are allocated for root operations, + // all of which should now exist. + pm.addPass(arm_sme::createTileAllocationPass()); + + // Enable streaming-mode and ZA. + pm.addPass(arm_sme::createEnableArmStreamingPass( + arm_sme::ArmStreamingMode::StreamingLocally, arm_sme::ArmZaMode::NewZA, + /*onlyIfRequiredByOps=*/true)); + + // Convert ArmSME to LLVM. + pm.addPass(createConvertArmSMEToLLVMPass()); + + // Sprinkle some cleanups. + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); +} +} // namespace + +namespace mlir { +namespace test { +void registerTestLowerToArmSME() { + PassPipelineRegistration( + "test-lower-to-arm-sme", + "An example pipeline to lower operations on vectors (arith, vector) to " + "LLVM via ArmSME.", + buildTestLowerToArmSME); +} +} // namespace test +} // namespace mlir diff --git a/mlir/test/lib/Dialect/CMakeLists.txt b/mlir/test/lib/Dialect/CMakeLists.txt index 30a17c201ff76..e20cd4473a358 100644 --- a/mlir/test/lib/Dialect/CMakeLists.txt +++ b/mlir/test/lib/Dialect/CMakeLists.txt @@ -1,5 +1,6 @@ add_subdirectory(Affine) add_subdirectory(Arith) +add_subdirectory(ArmSME) add_subdirectory(Bufferization) add_subdirectory(ControlFlow) add_subdirectory(DLTI) diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 68aa6bad5f92c..701fc461b3b4e 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -17,6 +17,7 @@ if(MLIR_INCLUDE_TESTS) MLIRTestFuncToLLVM MLIRAffineTransformsTestPasses MLIRArithTestPasses + MLIRArmSMETestPasses MLIRBufferizationTestPasses MLIRControlFlowTestPasses MLIRDLTITestPasses diff --git 
a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index f11c6b4355fdd..4dfa05cc8ca88 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -109,6 +109,7 @@ void registerTestLoopFusion(); void registerTestCFGLoopInfoPass(); void registerTestLoopMappingPass(); void registerTestLoopUnrollingPass(); +void registerTestLowerToArmSME(); void registerTestLowerToLLVM(); void registerTestMakeIsolatedFromAbovePass(); void registerTestMatchReductionPass(); @@ -233,6 +234,7 @@ void registerTestPasses() { mlir::test::registerTestCFGLoopInfoPass(); mlir::test::registerTestLoopMappingPass(); mlir::test::registerTestLoopUnrollingPass(); + mlir::test::registerTestLowerToArmSME(); mlir::test::registerTestLowerToLLVM(); mlir::test::registerTestMakeIsolatedFromAbovePass(); mlir::test::registerTestMatchReductionPass(); From 78890904c41cc4221839dafb7ae906971a9db51a Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 23 Feb 2024 09:48:58 +0000 Subject: [PATCH 327/351] [mlir][math] Propagate scalability in `convert-math-to-llvm` (#82635) This also generally increases the coverage of scalable vector types in the math-to-llvm tests. 
--- mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp | 18 ++--- .../Conversion/MathToLLVM/math-to-llvm.mlir | 81 +++++++++++++++++++ 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp index 1b729611a3623..23e957288eb95 100644 --- a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp +++ b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp @@ -148,10 +148,10 @@ struct ExpM1OpLowering : public ConvertOpToLLVMPattern { return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), adaptor.getOperands(), *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { + auto numElements = LLVM::getVectorNumElements(llvm1DVectorTy); auto splatAttr = SplatElementsAttr::get( - mlir::VectorType::get( - {LLVM::getVectorNumElements(llvm1DVectorTy).getFixedValue()}, - floatType), + mlir::VectorType::get({numElements.getKnownMinValue()}, floatType, + {numElements.isScalable()}), floatOne); auto one = rewriter.create(loc, llvm1DVectorTy, splatAttr); @@ -207,10 +207,10 @@ struct Log1pOpLowering : public ConvertOpToLLVMPattern { return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), adaptor.getOperands(), *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { + auto numElements = LLVM::getVectorNumElements(llvm1DVectorTy); auto splatAttr = SplatElementsAttr::get( - mlir::VectorType::get( - {LLVM::getVectorNumElements(llvm1DVectorTy).getFixedValue()}, - floatType), + mlir::VectorType::get({numElements.getKnownMinValue()}, floatType, + {numElements.isScalable()}), floatOne); auto one = rewriter.create(loc, llvm1DVectorTy, splatAttr); @@ -266,10 +266,10 @@ struct RsqrtOpLowering : public ConvertOpToLLVMPattern { return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), adaptor.getOperands(), *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { + auto numElements = LLVM::getVectorNumElements(llvm1DVectorTy); auto splatAttr 
= SplatElementsAttr::get( - mlir::VectorType::get( - {LLVM::getVectorNumElements(llvm1DVectorTy).getFixedValue()}, - floatType), + mlir::VectorType::get({numElements.getKnownMinValue()}, floatType, + {numElements.isScalable()}), floatOne); auto one = rewriter.create(loc, llvm1DVectorTy, splatAttr); diff --git a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir index 3de2f11d1d12c..56129dbd27889 100644 --- a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir +++ b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir @@ -77,6 +77,18 @@ func.func @log1p_2dvector_fmf(%arg0 : vector<4x3xf32>) { // ----- +// CHECK-LABEL: func @log1p_scalable_vector( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32> +func.func @log1p_scalable_vector(%arg0 : vector<[4]xf32>) -> vector<[4]xf32> { + // CHECK: %[[ONE:.*]] = llvm.mlir.constant(dense<1.000000e+00> : vector<[4]xf32>) : vector<[4]xf32> + // CHECK: %[[ADD:.*]] = llvm.fadd %[[ONE]], %[[VEC]] : vector<[4]xf32> + // CHECK: %[[LOG:.*]] = llvm.intr.log(%[[ADD]]) : (vector<[4]xf32>) -> vector<[4]xf32> + %0 = math.log1p %arg0 : vector<[4]xf32> + func.return %0 : vector<[4]xf32> +} + +// ----- + // CHECK-LABEL: func @expm1( // CHECK-SAME: f32 func.func @expm1(%arg0 : f32) { @@ -113,6 +125,18 @@ func.func @expm1_vector(%arg0 : vector<4xf32>) { // ----- +// CHECK-LABEL: func @expm1_scalable_vector( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32> +func.func @expm1_scalable_vector(%arg0 : vector<[4]xf32>) -> vector<[4]xf32> { + // CHECK: %[[ONE:.*]] = llvm.mlir.constant(dense<1.000000e+00> : vector<[4]xf32>) : vector<[4]xf32> + // CHECK: %[[EXP:.*]] = llvm.intr.exp(%[[VEC]]) : (vector<[4]xf32>) -> vector<[4]xf32> + // CHECK: %[[SUB:.*]] = llvm.fsub %[[EXP]], %[[ONE]] : vector<[4]xf32> + %0 = math.expm1 %arg0 : vector<[4]xf32> + func.return %0 : vector<[4]xf32> +} + +// ----- + // CHECK-LABEL: func @expm1_vector_fmf( // CHECK-SAME: vector<4xf32> func.func @expm1_vector_fmf(%arg0 : vector<4xf32>) { @@ 
-177,6 +201,16 @@ func.func @cttz_vec(%arg0 : vector<4xi32>) { // ----- +// CHECK-LABEL: func @cttz_scalable_vec( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xi32> +func.func @cttz_scalable_vec(%arg0 : vector<[4]xi32>) -> vector<[4]xi32> { + // CHECK: "llvm.intr.cttz"(%[[VEC]]) <{is_zero_poison = false}> : (vector<[4]xi32>) -> vector<[4]xi32> + %0 = math.cttz %arg0 : vector<[4]xi32> + func.return %0 : vector<[4]xi32> +} + +// ----- + // CHECK-LABEL: func @ctpop( // CHECK-SAME: i32 func.func @ctpop(%arg0 : i32) { @@ -197,6 +231,16 @@ func.func @ctpop_vector(%arg0 : vector<3xi32>) { // ----- +// CHECK-LABEL: func @ctpop_scalable_vector( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xi32> +func.func @ctpop_scalable_vector(%arg0 : vector<[4]xi32>) -> vector<[4]xi32> { + // CHECK: llvm.intr.ctpop(%[[VEC]]) : (vector<[4]xi32>) -> vector<[4]xi32> + %0 = math.ctpop %arg0 : vector<[4]xi32> + func.return %0 : vector<[4]xi32> +} + +// ----- + // CHECK-LABEL: func @rsqrt_double( // CHECK-SAME: f64 func.func @rsqrt_double(%arg0 : f64) { @@ -233,6 +277,18 @@ func.func @rsqrt_vector(%arg0 : vector<4xf32>) { // ----- +// CHECK-LABEL: func @rsqrt_scalable_vector( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32> +func.func @rsqrt_scalable_vector(%arg0 : vector<[4]xf32>) -> vector<[4]xf32>{ + // CHECK: %[[ONE:.*]] = llvm.mlir.constant(dense<1.000000e+00> : vector<[4]xf32>) : vector<[4]xf32> + // CHECK: %[[SQRT:.*]] = llvm.intr.sqrt(%[[VEC]]) : (vector<[4]xf32>) -> vector<[4]xf32> + // CHECK: %[[DIV:.*]] = llvm.fdiv %[[ONE]], %[[SQRT]] : vector<[4]xf32> + %0 = math.rsqrt %arg0 : vector<[4]xf32> + func.return %0 : vector<[4]xf32> +} + +// ----- + // CHECK-LABEL: func @rsqrt_vector_fmf( // CHECK-SAME: vector<4xf32> func.func @rsqrt_vector_fmf(%arg0 : vector<4xf32>) { @@ -245,6 +301,18 @@ func.func @rsqrt_vector_fmf(%arg0 : vector<4xf32>) { // ----- +// CHECK-LABEL: func @rsqrt_scalable_vector_fmf( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32> +func.func @rsqrt_scalable_vector_fmf(%arg0 : 
Don't diagnose already invalid function decls
--- clang/lib/AST/Interp/Interp.cpp | 4 ++++ clang/test/SemaCXX/PR68542.cpp | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 82bc1f240cc51..b2fe70dc14f9d 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -462,6 +462,10 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { if (S.getLangOpts().CPlusPlus11) { const FunctionDecl *DiagDecl = F->getDecl(); + // Invalid decls have been diagnosed before. + if (DiagDecl->isInvalidDecl()) + return false; + // If this function is not constexpr because it is an inherited // non-constexpr constructor, diagnose that directly. const auto *CD = dyn_cast(DiagDecl); diff --git a/clang/test/SemaCXX/PR68542.cpp b/clang/test/SemaCXX/PR68542.cpp index fc767a78c8b00..e266bf9ba77ab 100644 --- a/clang/test/SemaCXX/PR68542.cpp +++ b/clang/test/SemaCXX/PR68542.cpp @@ -1,20 +1,20 @@ // RUN: %clang_cc1 -verify -std=c++20 -fsyntax-only %s +// RUN: %clang_cc1 -verify -std=c++20 -fsyntax-only %s -fexperimental-new-constant-interpreter -struct S { +struct S { // expected-note {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'S &&' for 1st argument}} \ + // expected-note {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const S &' for 1st argument}} int e; }; template consteval int get_format() { - return nullptr; // expected-error{{cannot initialize return object of type 'int' with an rvalue of type 'std::nullptr_t'}} + return nullptr; // expected-error {{cannot initialize return object of type 'int' with an rvalue of type 'std::nullptr_t'}} } template constexpr S f(T) noexcept { - return get_format(); // expected-error{{no viable conversion from returned value of type 'int' to function return type 'S'}} + return get_format(); // expected-error {{no viable conversion from returned 
value of type 'int' to function return type 'S'}} } -constexpr S x = f(0); // expected-error{{constexpr variable 'x' must be initialized by a constant expression}} -// expected-note@-1{{in instantiation of function template specialization 'f' requested here}} -// expected-note@3{{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'S &&' for 1st argument}} -// expected-note@3{{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const S &' for 1st argument}} +constexpr S x = f(0); // expected-error {{constexpr variable 'x' must be initialized by a constant expression}} \ + // expected-note {{in instantiation of function template specialization 'f' requested here}} From 5f1319bb385342c7ef4124b05b83b89ef8588ee8 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 11:28:05 +0100 Subject: [PATCH 329/351] [mlir][Transforms] Encapsulate dialect conversion options in `ConversionConfig` (#82250) This commit adds a new `ConversionConfig` struct that allows users to customize the dialect conversion. This configuration is similar to `GreedyRewriteConfig` for the greedy pattern rewrite driver. A few existing options are moved to this objects, simplifying the dialect conversion API. --- .../mlir/Transforms/DialectConversion.h | 75 ++++++---- .../Transforms/Utils/DialectConversion.cpp | 134 ++++++++---------- mlir/test/lib/Dialect/Test/TestPatterns.cpp | 14 +- 3 files changed, 118 insertions(+), 105 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5c91a9498b35d..7e8e67a9d1782 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -24,6 +24,7 @@ namespace mlir { // Forward declarations. 
class Attribute; class Block; +struct ConversionConfig; class ConversionPatternRewriter; class MLIRContext; class Operation; @@ -770,7 +771,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Conversion pattern rewriters must not be used outside of dialect /// conversions. They apply some IR rewrites in a delayed fashion and could /// bring the IR into an inconsistent state when used standalone. - explicit ConversionPatternRewriter(MLIRContext *ctx); + explicit ConversionPatternRewriter(MLIRContext *ctx, + const ConversionConfig &config); // Hide unsupported pattern rewriter API. using OpBuilder::setListener; @@ -1070,6 +1072,30 @@ class PDLConversionConfig final { #endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH +//===----------------------------------------------------------------------===// +// ConversionConfig +//===----------------------------------------------------------------------===// + +/// Dialect conversion configuration. +struct ConversionConfig { + /// An optional callback used to notify about match failure diagnostics during + /// the conversion. Diagnostics reported to this callback may only be + /// available in debug mode. + function_ref notifyCallback = nullptr; + + /// Partial conversion only. All operations that are found not to be + /// legalizable are placed in this set. (Note that if there is an op + /// explicitly marked as illegal, the conversion terminates and the set will + /// not necessarily be complete.) + DenseSet *unlegalizedOps = nullptr; + + /// Analysis conversion only. All operations that are found to be legalizable + /// are placed in this set. Note that no actual rewrites are applied to the + /// IR during an analysis conversion and only pre-existing operations are + /// added to the set. 
+/// returns failure if there are ops explicitly marked as illegal.
LogicalResult applyFullConversion(ArrayRef ops, const ConversionTarget &target, - const FrozenRewritePatternSet &patterns); + const FrozenRewritePatternSet &patterns, + ConversionConfig config = ConversionConfig()); LogicalResult applyFullConversion(Operation *op, const ConversionTarget &target, - const FrozenRewritePatternSet &patterns); + const FrozenRewritePatternSet &patterns, + ConversionConfig config = ConversionConfig()); /// Apply an analysis conversion on the given operations, and all nested /// operations. This method analyzes which operations would be successfully /// converted to the target if a conversion was applied. All operations that /// were found to be legalizable to the given 'target' are placed within the -/// provided 'convertedOps' set; note that no actual rewrites are applied to the -/// operations on success and only pre-existing operations are added to the set. -/// This method only returns failure if there are unreachable blocks in any of -/// the regions nested within 'ops'. There's an additional argument -/// `notifyCallback` which is used for collecting match failure diagnostics -/// generated during the conversion. Diagnostics are only reported to this -/// callback may only be available in debug mode. -LogicalResult applyAnalysisConversion( - ArrayRef ops, ConversionTarget &target, - const FrozenRewritePatternSet &patterns, - DenseSet &convertedOps, - function_ref notifyCallback = nullptr); -LogicalResult applyAnalysisConversion( - Operation *op, ConversionTarget &target, - const FrozenRewritePatternSet &patterns, - DenseSet &convertedOps, - function_ref notifyCallback = nullptr); +/// provided 'config.legalizableOps' set; note that no actual rewrites are +/// applied to the operations on success. This method only returns failure if +/// there are unreachable blocks in any of the regions nested within 'ops'. 
+LogicalResult +applyAnalysisConversion(ArrayRef ops, ConversionTarget &target, + const FrozenRewritePatternSet &patterns, + ConversionConfig config = ConversionConfig()); +LogicalResult +applyAnalysisConversion(Operation *op, ConversionTarget &target, + const FrozenRewritePatternSet &patterns, + ConversionConfig config = ConversionConfig()); } // namespace mlir #endif // MLIR_TRANSFORMS_DIALECTCONVERSION_H_ diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 2cdbfb78faf27..508ee7416d55d 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -228,6 +228,8 @@ class IRRewrite { /// Erase the given block (unless it was already erased). void eraseBlock(Block *block); + const ConversionConfig &getConfig() const; + const Kind kind; ConversionPatternRewriterImpl &rewriterImpl; }; @@ -754,9 +756,10 @@ static RewriteTy *findSingleRewrite(R &&rewrites, Block *block) { namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { - explicit ConversionPatternRewriterImpl(PatternRewriter &rewriter) + explicit ConversionPatternRewriterImpl(PatternRewriter &rewriter, + const ConversionConfig &config) : rewriter(rewriter), eraseRewriter(rewriter.getContext()), - notifyCallback(nullptr) {} + config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -962,14 +965,8 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// converting the arguments of blocks within that region. DenseMap regionToConverter; - /// This allows the user to collect the match failure message. - function_ref notifyCallback; - - /// A set of pre-existing operations. When mode == OpConversionMode::Analysis, - /// this is populated with ops found to be legalizable to the target. 
- /// When mode == OpConversionMode::Partial, this is populated with ops found - /// *not* to be legalizable to the target. - DenseSet *trackedOps = nullptr; + /// Dialect conversion configuration. + const ConversionConfig &config; #ifndef NDEBUG /// A set of operations that have pending updates. This tracking isn't @@ -992,6 +989,10 @@ void IRRewrite::eraseBlock(Block *block) { rewriterImpl.eraseRewriter.eraseBlock(block); } +const ConversionConfig &IRRewrite::getConfig() const { + return rewriterImpl.config; +} + void BlockTypeConversionRewrite::commit() { // Process the remapping for each of the original arguments. for (auto [origArg, info] : @@ -1107,8 +1108,8 @@ void ReplaceOperationRewrite::commit() { if (Value newValue = rewriterImpl.mapping.lookupOrNull(result, result.getType())) result.replaceAllUsesWith(newValue); - if (rewriterImpl.trackedOps) - rewriterImpl.trackedOps->erase(op); + if (getConfig().unlegalizedOps) + getConfig().unlegalizedOps->erase(op); // Do not erase the operation yet. It may still be referenced in `mapping`. 
op->getBlock()->getOperations().remove(op); } @@ -1543,8 +1544,8 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( Diagnostic diag(loc, DiagnosticSeverity::Remark); reasonCallback(diag); logger.startLine() << "** Failure : " << diag.str() << "\n"; - if (notifyCallback) - notifyCallback(diag); + if (config.notifyCallback) + config.notifyCallback(diag); }); } @@ -1552,9 +1553,10 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( // ConversionPatternRewriter //===----------------------------------------------------------------------===// -ConversionPatternRewriter::ConversionPatternRewriter(MLIRContext *ctx) +ConversionPatternRewriter::ConversionPatternRewriter( + MLIRContext *ctx, const ConversionConfig &config) : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this)) { + impl(new detail::ConversionPatternRewriterImpl(*this, config)) { setListener(impl.get()); } @@ -2005,12 +2007,12 @@ OperationLegalizer::legalizeWithPattern(Operation *op, assert(rewriterImpl.pendingRootUpdates.empty() && "dangling root updates"); LLVM_DEBUG({ logFailure(rewriterImpl.logger, "pattern failed to match"); - if (rewriterImpl.notifyCallback) { + if (rewriterImpl.config.notifyCallback) { Diagnostic diag(op->getLoc(), DiagnosticSeverity::Remark); diag << "Failed to apply pattern \"" << pattern.getDebugName() << "\" on op:\n" << *op; - rewriterImpl.notifyCallback(diag); + rewriterImpl.config.notifyCallback(diag); } }); rewriterImpl.resetState(curState); @@ -2398,14 +2400,12 @@ namespace mlir { struct OperationConverter { explicit OperationConverter(const ConversionTarget &target, const FrozenRewritePatternSet &patterns, - OpConversionMode mode, - DenseSet *trackedOps = nullptr) - : opLegalizer(target, patterns), mode(mode), trackedOps(trackedOps) {} + const ConversionConfig &config, + OpConversionMode mode) + : opLegalizer(target, patterns), config(config), mode(mode) {} /// Converts the given operations to the conversion target. 
+ // explicitly marked as illegal. If the user provided an `unlegalizedOps`
- trackedOps->insert(op); + if (config.legalizableOps) + config.legalizableOps->insert(op); } return success(); } -LogicalResult OperationConverter::convertOperations( - ArrayRef ops, - function_ref notifyCallback) { +LogicalResult OperationConverter::convertOperations(ArrayRef ops) { if (ops.empty()) return success(); const ConversionTarget &target = opLegalizer.getTarget(); @@ -2505,10 +2501,8 @@ LogicalResult OperationConverter::convertOperations( } // Convert each operation and discard rewrites on failure. - ConversionPatternRewriter rewriter(ops.front()->getContext()); + ConversionPatternRewriter rewriter(ops.front()->getContext(), config); ConversionPatternRewriterImpl &rewriterImpl = rewriter.getImpl(); - rewriterImpl.notifyCallback = notifyCallback; - rewriterImpl.trackedOps = trackedOps; for (auto *op : toConvert) if (failed(convert(rewriter, op))) @@ -3495,57 +3489,51 @@ void mlir::registerConversionPDLFunctions(RewritePatternSet &patterns) { //===----------------------------------------------------------------------===// // Partial Conversion -LogicalResult -mlir::applyPartialConversion(ArrayRef ops, - const ConversionTarget &target, - const FrozenRewritePatternSet &patterns, - DenseSet *unconvertedOps) { - OperationConverter opConverter(target, patterns, OpConversionMode::Partial, - unconvertedOps); +LogicalResult mlir::applyPartialConversion( + ArrayRef ops, const ConversionTarget &target, + const FrozenRewritePatternSet &patterns, ConversionConfig config) { + OperationConverter opConverter(target, patterns, config, + OpConversionMode::Partial); return opConverter.convertOperations(ops); } LogicalResult mlir::applyPartialConversion(Operation *op, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, - DenseSet *unconvertedOps) { - return applyPartialConversion(llvm::ArrayRef(op), target, patterns, - unconvertedOps); + ConversionConfig config) { + return applyPartialConversion(llvm::ArrayRef(op), target, patterns, config); } 
//===----------------------------------------------------------------------===// // Full Conversion -LogicalResult -mlir::applyFullConversion(ArrayRef ops, - const ConversionTarget &target, - const FrozenRewritePatternSet &patterns) { - OperationConverter opConverter(target, patterns, OpConversionMode::Full); +LogicalResult mlir::applyFullConversion(ArrayRef ops, + const ConversionTarget &target, + const FrozenRewritePatternSet &patterns, + ConversionConfig config) { + OperationConverter opConverter(target, patterns, config, + OpConversionMode::Full); return opConverter.convertOperations(ops); } -LogicalResult -mlir::applyFullConversion(Operation *op, const ConversionTarget &target, - const FrozenRewritePatternSet &patterns) { - return applyFullConversion(llvm::ArrayRef(op), target, patterns); +LogicalResult mlir::applyFullConversion(Operation *op, + const ConversionTarget &target, + const FrozenRewritePatternSet &patterns, + ConversionConfig config) { + return applyFullConversion(llvm::ArrayRef(op), target, patterns, config); } //===----------------------------------------------------------------------===// // Analysis Conversion -LogicalResult -mlir::applyAnalysisConversion(ArrayRef ops, - ConversionTarget &target, - const FrozenRewritePatternSet &patterns, - DenseSet &convertedOps, - function_ref notifyCallback) { - OperationConverter opConverter(target, patterns, OpConversionMode::Analysis, - &convertedOps); - return opConverter.convertOperations(ops, notifyCallback); +LogicalResult mlir::applyAnalysisConversion( + ArrayRef ops, ConversionTarget &target, + const FrozenRewritePatternSet &patterns, ConversionConfig config) { + OperationConverter opConverter(target, patterns, config, + OpConversionMode::Analysis); + return opConverter.convertOperations(ops); } LogicalResult mlir::applyAnalysisConversion(Operation *op, ConversionTarget &target, const FrozenRewritePatternSet &patterns, - DenseSet &convertedOps, - function_ref notifyCallback) { - return 
applyAnalysisConversion(llvm::ArrayRef(op), target, patterns, - convertedOps, notifyCallback); + ConversionConfig config) { + return applyAnalysisConversion(llvm::ArrayRef(op), target, patterns, config); } diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 108cfe8950ef6..bde4255ee4b36 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1152,8 +1152,10 @@ struct TestLegalizePatternDriver // Handle a partial conversion. if (mode == ConversionMode::Partial) { DenseSet unlegalizedOps; - if (failed(applyPartialConversion( - getOperation(), target, std::move(patterns), &unlegalizedOps))) { + ConversionConfig config; + config.unlegalizedOps = &unlegalizedOps; + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns), config))) { getOperation()->emitRemark() << "applyPartialConversion failed"; } // Emit remarks for each legalizable operation. @@ -1181,8 +1183,10 @@ struct TestLegalizePatternDriver // Analyze the convertible operations. DenseSet legalizedOps; + ConversionConfig config; + config.legalizableOps = &legalizedOps; if (failed(applyAnalysisConversion(getOperation(), target, - std::move(patterns), legalizedOps))) + std::move(patterns), config))) return signalPassFailure(); // Emit remarks for each legalizable operation. 
in case of instantiations we point to primary templates and not specializations.
pattern. EXPECT_THAT(testWalk(R"cpp( - template T Foo = 0; - template T* $explicit^Foo = nullptr;)cpp", + template T $explicit^Foo = 0; + template T* Foo = nullptr;)cpp", "int *x = ^Foo;"), - ElementsAre(Decl::VarTemplateSpecialization)); + ElementsAre(Decl::VarTemplate)); EXPECT_THAT(testWalk(R"cpp( template T $explicit^Foo = 0; template int Foo;)cpp", "int x = ^Foo;"), - ElementsAre(Decl::VarTemplateSpecialization)); + ElementsAre(Decl::VarTemplate)); } TEST(WalkAST, FunctionTemplates) { // Explicit instantiation and (partial) specialization references primary diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index fcccac10f4733..e457694e4625d 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -8540,7 +8540,7 @@ class Sema final { /// if the arguments are dependent. ExprResult CheckVarTemplateId(const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, - VarTemplateDecl *Template, + VarTemplateDecl *Template, NamedDecl *FoundD, SourceLocation TemplateLoc, const TemplateArgumentListInfo *TemplateArgs); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 1a975a8d0a0df..7d3d665194add 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4958,11 +4958,10 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc, return Decl; } -ExprResult -Sema::CheckVarTemplateId(const CXXScopeSpec &SS, - const DeclarationNameInfo &NameInfo, - VarTemplateDecl *Template, SourceLocation TemplateLoc, - const TemplateArgumentListInfo *TemplateArgs) { +ExprResult Sema::CheckVarTemplateId( + const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, + VarTemplateDecl *Template, NamedDecl *FoundD, SourceLocation TemplateLoc, + const TemplateArgumentListInfo *TemplateArgs) { DeclResult Decl = CheckVarTemplateId(Template, TemplateLoc, NameInfo.getLoc(), *TemplateArgs); @@ -4978,8 +4977,7 @@ Sema::CheckVarTemplateId(const CXXScopeSpec 
&SS, NameInfo.getLoc()); // Build an ordinary singleton decl ref. - return BuildDeclarationNameExpr(SS, NameInfo, Var, - /*FoundD=*/nullptr, TemplateArgs); + return BuildDeclarationNameExpr(SS, NameInfo, Var, FoundD, TemplateArgs); } void Sema::diagnoseMissingTemplateArguments(TemplateName Name, @@ -5066,9 +5064,9 @@ ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS, bool KnownDependent = false; // In C++1y, check variable template ids. if (R.getAsSingle()) { - ExprResult Res = CheckVarTemplateId(SS, R.getLookupNameInfo(), - R.getAsSingle(), - TemplateKWLoc, TemplateArgs); + ExprResult Res = CheckVarTemplateId( + SS, R.getLookupNameInfo(), R.getAsSingle(), + R.getRepresentativeDecl(), TemplateKWLoc, TemplateArgs); if (Res.isInvalid() || Res.isUsable()) return Res; // Result is dependent. Carry on to build an UnresolvedLookupEpxr. diff --git a/clang/test/AST/ast-dump-using.cpp b/clang/test/AST/ast-dump-using.cpp index 5a4e910ffb865..8e5c60d3aabf4 100644 --- a/clang/test/AST/ast-dump-using.cpp +++ b/clang/test/AST/ast-dump-using.cpp @@ -2,6 +2,7 @@ namespace a { struct S; +template T x = {}; } namespace b { using a::S; @@ -21,4 +22,10 @@ typedef S e; // check the same UsingType is reused. 
// CHECK-NEXT: `-UsingType [[TYPE_ADDR]] 'a::S' sugar // CHECK-NEXT: |-UsingShadow [[SHADOW_ADDR]] 'S' // CHECK-NEXT: `-RecordType {{.*}} 'a::S' +using a::x; + +void foo() { + x = 3; + // CHECK: DeclRefExpr {{.*}} 'x' {{.*}} (UsingShadow {{.*}} 'x') +} } From 4419b2c27fa45a08bc3892ad0c8c5eb95d96d608 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Fri, 23 Feb 2024 11:38:00 +0100 Subject: [PATCH 331/351] [clangd] Make tidy-rename tests conditional --- .../clangd/unittests/ClangdLSPServerTests.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp index 555c4c5749981..75a140767035b 100644 --- a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp +++ b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp @@ -11,6 +11,7 @@ #include "ClangdServer.h" #include "ConfigProvider.h" #include "Diagnostics.h" +#include "Feature.h" #include "FeatureModule.h" #include "LSPBinder.h" #include "LSPClient.h" @@ -198,6 +199,9 @@ TEST_F(LSPTest, RecordsLatencies) { // clang-tidy's renames are converted to clangd's internal rename functionality, // see clangd#1589 and clangd#741 TEST_F(LSPTest, ClangTidyRename) { + // This test requires clang-tidy checks to be linked in. 
+ if (!CLANGD_TIDY_CHECKS) + return; Annotations Header(R"cpp( void [[foo]](); )cpp"); @@ -214,7 +218,9 @@ TEST_F(LSPTest, ClangTidyRename) { Client.didOpen("foo.hpp", Header.code()); Client.didOpen("foo.cpp", Source.code()); - auto RenameDiag = Client.diagnostics("foo.cpp").value().at(0); + auto Diags = Client.diagnostics("foo.cpp"); + ASSERT_TRUE(Diags && !Diags->empty()); + auto RenameDiag = Diags->front(); auto RenameCommand = (*Client From de04b7d44edbfe8c2357cc291f8806575e6e93f2 Mon Sep 17 00:00:00 2001 From: Daniel Krupp Date: Fri, 23 Feb 2024 11:44:34 +0100 Subject: [PATCH 332/351] [analyzer] Fix core.VLASize checker false positive taint reports (#68140) The checker reported a false positive on this code void testTaintedSanitizedVLASize(void) { int x; scanf("%d", &x); if (x<1) return; int vla[x]; // no-warning } After the fix, the checker only emits tainted warning if the vla size is coming from a tainted source and it cannot prove that it is positive. --- clang/docs/analyzer/checkers.rst | 27 ++++++++++++++++--- .../Checkers/VLASizeChecker.cpp | 16 +++++------ .../test/Analysis/taint-diagnostic-visitor.c | 4 +-- clang/test/Analysis/taint-generic.c | 11 +++++++- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 510629d8a2d48..899622ae283b1 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -213,9 +213,8 @@ Check for undefined results of binary operators. core.VLASize (C) """""""""""""""" -Check for declarations of Variable Length Arrays of undefined or zero size. - - Check for declarations of VLA of undefined or zero size. +Check for declarations of Variable Length Arrays (VLA) of undefined, zero or negative +size. .. code-block:: c @@ -229,6 +228,28 @@ Check for declarations of Variable Length Arrays of undefined or zero size. 
int vla2[x]; // warn: zero size } + +The checker also gives warning if the `TaintPropagation` checker is switched on +and an unbound, attacker controlled (tainted) value is used to define +the size of the VLA. + +.. code-block:: c + + void taintedVLA(void) { + int x; + scanf("%d", &x); + int vla[x]; // Declared variable-length array (VLA) has tainted (attacker controlled) size, that can be 0 or negative + } + + void taintedVerfieidVLA(void) { + int x; + scanf("%d", &x); + if (x<1) + return; + int vla[x]; // no-warning. The analyzer can prove that x must be positive. + } + + .. _core-uninitialized-ArraySubscript: core.uninitialized.ArraySubscript (C) diff --git a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp index d76fe49918690..87d255eeffc17 100644 --- a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp @@ -164,12 +164,6 @@ ProgramStateRef VLASizeChecker::checkVLAIndexSize(CheckerContext &C, if (SizeV.isUnknown()) return nullptr; - // Check if the size is tainted. - if (isTainted(State, SizeV)) { - reportTaintBug(SizeE, State, C, SizeV); - return nullptr; - } - // Check if the size is zero. DefinedSVal SizeD = SizeV.castAs(); @@ -192,10 +186,10 @@ ProgramStateRef VLASizeChecker::checkVLAIndexSize(CheckerContext &C, SVal LessThanZeroVal = SVB.evalBinOp(State, BO_LT, SizeD, Zero, SVB.getConditionType()); + ProgramStateRef StatePos, StateNeg; if (std::optional LessThanZeroDVal = LessThanZeroVal.getAs()) { ConstraintManager &CM = C.getConstraintManager(); - ProgramStateRef StatePos, StateNeg; std::tie(StateNeg, StatePos) = CM.assumeDual(State, *LessThanZeroDVal); if (StateNeg && !StatePos) { @@ -205,6 +199,12 @@ ProgramStateRef VLASizeChecker::checkVLAIndexSize(CheckerContext &C, State = StatePos; } + // Check if the size is tainted. 
+ if ((StateNeg || StateZero) && isTainted(State, SizeV)) { + reportTaintBug(SizeE, State, C, SizeV); + return nullptr; + } + return State; } @@ -218,7 +218,7 @@ void VLASizeChecker::reportTaintBug(const Expr *SizeE, ProgramStateRef State, SmallString<256> buf; llvm::raw_svector_ostream os(buf); os << "Declared variable-length array (VLA) "; - os << "has tainted size"; + os << "has tainted (attacker controlled) size that can be 0 or negative"; auto report = std::make_unique(TaintBT, os.str(), N); report->addRange(SizeE->getSourceRange()); diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c index a3fa1639bffee..020e9579ac535 100644 --- a/clang/test/Analysis/taint-diagnostic-visitor.c +++ b/clang/test/Analysis/taint-diagnostic-visitor.c @@ -46,8 +46,8 @@ void taintDiagnosticVLA(void) { scanf("%d", &x); // expected-note {{Value assigned to 'x'}} // expected-note@-1 {{Taint originated here}} // expected-note@-2 {{Taint propagated to the 2nd argument}} - int vla[x]; // expected-warning {{Declared variable-length array (VLA) has tainted size}} - // expected-note@-1 {{Declared variable-length array (VLA) has tainted size}} + int vla[x]; // expected-warning {{Declared variable-length array (VLA) has tainted}} + // expected-note@-1 {{Declared variable-length array (VLA) has tainted}} } diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c index 4ff474b2ed40d..e85b4106a5806 100644 --- a/clang/test/Analysis/taint-generic.c +++ b/clang/test/Analysis/taint-generic.c @@ -405,7 +405,16 @@ int testDivByZero(void) { void testTaintedVLASize(void) { int x; scanf("%d", &x); - int vla[x]; // expected-warning{{Declared variable-length array (VLA) has tainted size}} + int vla[x]; // expected-warning{{Declared variable-length array (VLA) has tainted (attacker controlled) size that can be 0 or negative}} +} + +// Tainted-sanitized VLAs. 
+void testTaintedSanitizedVLASize(void) { + int x; + scanf("%d", &x); + if (x<1) + return; + int vla[x]; // no-warning } int testTaintedAllocaMem() { From 9dfb8430509619a4e9d36fd00a11b83a2d5d0c3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Fri, 23 Feb 2024 11:48:04 +0100 Subject: [PATCH 333/351] [include-cleaner] Use FoundDecl only for using-shadow-decls (#82615) --- .../include-cleaner/lib/WalkAST.cpp | 5 +++ .../include-cleaner/unittests/WalkASTTest.cpp | 34 +++++++++++-------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index 6c4d9b7862d91..277e6ec5b0890 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -128,6 +128,11 @@ class ASTWalker : public RecursiveASTVisitor { bool VisitDeclRefExpr(DeclRefExpr *DRE) { auto *FD = DRE->getFoundDecl(); + // Prefer the underlying decl if FoundDecl isn't a shadow decl, e.g: + // - For templates, found-decl is always primary template, but we want the + // specializaiton itself. + if (!llvm::isa(FD)) + FD = DRE->getDecl(); // For refs to non-meber-like decls, use the found decl. 
// For member-like decls, we should have a reference from the qualifier to // the container decl instead, which is preferred as it'll handle diff --git a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp index 0be5db36b1fc5..e238dc3d902bb 100644 --- a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp @@ -200,24 +200,26 @@ TEST(WalkAST, VarTemplates) { EXPECT_THAT(testWalk(R"cpp( template T $explicit^Foo = 0;)cpp", "int z = ^Foo;"), - ElementsAre(Decl::VarTemplate)); + ElementsAre(Decl::VarTemplateSpecialization)); EXPECT_THAT(testWalk(R"cpp( - template T $explicit^Foo = 0; - template<> int Foo = 1;)cpp", + template T Foo = 0; + template<> int $explicit^Foo = 1;)cpp", "int x = ^Foo;"), - ElementsAre(Decl::VarTemplate)); + ElementsAre(Decl::VarTemplateSpecialization)); // FIXME: This points at implicit specialization, instead we should point to // explicit partial specializaiton pattern. EXPECT_THAT(testWalk(R"cpp( - template T $explicit^Foo = 0; - template T* Foo = nullptr;)cpp", + template T Foo = 0; + template T* $explicit^Foo = nullptr;)cpp", "int *x = ^Foo;"), - ElementsAre(Decl::VarTemplate)); + ElementsAre(Decl::VarTemplateSpecialization)); + // Implicit specializations through explicit instantiations has source + // locations pointing at the primary template. EXPECT_THAT(testWalk(R"cpp( template T $explicit^Foo = 0; template int Foo;)cpp", "int x = ^Foo;"), - ElementsAre(Decl::VarTemplate)); + ElementsAre(Decl::VarTemplateSpecialization)); } TEST(WalkAST, FunctionTemplates) { // Explicit instantiation and (partial) specialization references primary @@ -239,18 +241,19 @@ TEST(WalkAST, FunctionTemplates) { EXPECT_THAT(testWalk(R"cpp( template void $explicit^foo() {})cpp", "auto x = []{ ^foo(); };"), - ElementsAre(Decl::FunctionTemplate)); - // FIXME: DeclRefExpr points at primary template, not the specialization. 
+ ElementsAre(Decl::Function)); EXPECT_THAT(testWalk(R"cpp( - template void $explicit^foo() {} - template<> void foo(){})cpp", + template void foo() {} + template<> void $explicit^foo(){})cpp", "auto x = []{ ^foo(); };"), - ElementsAre(Decl::FunctionTemplate)); + ElementsAre(Decl::Function)); + // The decl is actually the specialization, but explicit instantations point + // at the primary template. EXPECT_THAT(testWalk(R"cpp( template void $explicit^foo() {}; template void foo();)cpp", "auto x = [] { ^foo(); };"), - ElementsAre(Decl::FunctionTemplate)); + ElementsAre(Decl::Function)); } TEST(WalkAST, TemplateSpecializationsFromUsingDecl) { // Class templates @@ -548,7 +551,8 @@ TEST(WalkAST, Concepts) { testWalk(Concept, "template void func() requires ^Foo {}"); testWalk(Concept, "void func(^Foo auto x) {}"); // FIXME: Foo should be explicitly referenced. - testWalk("template concept Foo = true;", "void func() { ^Foo auto x = 1; }"); + testWalk("template concept Foo = true;", + "void func() { ^Foo auto x = 1; }"); } TEST(WalkAST, FriendDecl) { From 7bb08ee8260c825eb5af4824bc62f73155b4b592 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 11:55:24 +0100 Subject: [PATCH 334/351] [mlir][Transforms][NFC] Decouple `ConversionPatternRewriterImpl` from `ConversionPatternRewriter` (#82333) `ConversionPatternRewriterImpl` no longer maintains a reference to the respective `ConversionPatternRewriter`. An `MLIRContext` is sufficient. This commit simplifies the internal state of `ConversionPatternRewriterImpl`. 
--- .../Transforms/Utils/DialectConversion.cpp | 44 +++++++++---------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 508ee7416d55d..d015bd5290123 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -756,10 +756,9 @@ static RewriteTy *findSingleRewrite(R &&rewrites, Block *block) { namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { - explicit ConversionPatternRewriterImpl(PatternRewriter &rewriter, + explicit ConversionPatternRewriterImpl(MLIRContext *ctx, const ConversionConfig &config) - : rewriter(rewriter), eraseRewriter(rewriter.getContext()), - config(config) {} + : eraseRewriter(ctx), config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -854,8 +853,8 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { Type origOutputType, const TypeConverter *converter); - Value buildUnresolvedArgumentMaterialization(PatternRewriter &rewriter, - Location loc, ValueRange inputs, + Value buildUnresolvedArgumentMaterialization(Block *block, Location loc, + ValueRange inputs, Type origOutputType, Type outputType, const TypeConverter *converter); @@ -934,8 +933,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { // State //===--------------------------------------------------------------------===// - PatternRewriter &rewriter; - /// This rewriter must be used for erasing ops/blocks. SingleEraseRewriter eraseRewriter; @@ -1037,8 +1034,12 @@ void BlockTypeConversionRewrite::rollback() { LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( function_ref findLiveUser) { + auto builder = OpBuilder::atBlockBegin(block, /*listener=*/&rewriterImpl); + // Process the remapping for each of the original arguments. 
for (auto it : llvm::enumerate(origBlock->getArguments())) { + OpBuilder::InsertionGuard g(builder); + // If the type of this argument changed and the argument is still live, we // need to materialize a conversion. BlockArgument origArg = it.value(); @@ -1050,14 +1051,12 @@ LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( Value replacementValue = rewriterImpl.mapping.lookupOrDefault(origArg); bool isDroppedArg = replacementValue == origArg; - if (isDroppedArg) - rewriterImpl.rewriter.setInsertionPointToStart(getBlock()); - else - rewriterImpl.rewriter.setInsertionPointAfterValue(replacementValue); + if (!isDroppedArg) + builder.setInsertionPointAfterValue(replacementValue); Value newArg; if (converter) { newArg = converter->materializeSourceConversion( - rewriterImpl.rewriter, origArg.getLoc(), origArg.getType(), + builder, origArg.getLoc(), origArg.getType(), isDroppedArg ? ValueRange() : ValueRange(replacementValue)); assert((!newArg || newArg.getType() == origArg.getType()) && "materialization hook did not provide a value of the expected " @@ -1322,6 +1321,8 @@ LogicalResult ConversionPatternRewriterImpl::convertNonEntryRegionTypes( Block *ConversionPatternRewriterImpl::applySignatureConversion( Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion &signatureConversion) { + MLIRContext *ctx = block->getParentOp()->getContext(); + // If no arguments are being changed or added, there is nothing to do. unsigned origArgCount = block->getNumArguments(); auto convertedTypes = signatureConversion.getConvertedTypes(); @@ -1338,7 +1339,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( // Map all new arguments to the location of the argument they originate from. 
SmallVector newLocs(convertedTypes.size(), - rewriter.getUnknownLoc()); + Builder(ctx).getUnknownLoc()); for (unsigned i = 0; i < origArgCount; ++i) { auto inputMap = signatureConversion.getInputMapping(i); if (!inputMap || inputMap->replacementValue) @@ -1357,8 +1358,6 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( SmallVector, 1> argInfo; argInfo.resize(origArgCount); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(newBlock); for (unsigned i = 0; i != origArgCount; ++i) { auto inputMap = signatureConversion.getInputMapping(i); if (!inputMap) @@ -1401,7 +1400,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( outputType = legalOutputType; newArg = buildUnresolvedArgumentMaterialization( - rewriter, origArg.getLoc(), replArgs, origOutputType, outputType, + newBlock, origArg.getLoc(), replArgs, origOutputType, outputType, converter); } @@ -1439,12 +1438,11 @@ Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization( return convertOp.getResult(0); } Value ConversionPatternRewriterImpl::buildUnresolvedArgumentMaterialization( - PatternRewriter &rewriter, Location loc, ValueRange inputs, - Type origOutputType, Type outputType, const TypeConverter *converter) { - return buildUnresolvedMaterialization( - MaterializationKind::Argument, rewriter.getInsertionBlock(), - rewriter.getInsertionPoint(), loc, inputs, outputType, origOutputType, - converter); + Block *block, Location loc, ValueRange inputs, Type origOutputType, + Type outputType, const TypeConverter *converter) { + return buildUnresolvedMaterialization(MaterializationKind::Argument, block, + block->begin(), loc, inputs, outputType, + origOutputType, converter); } Value ConversionPatternRewriterImpl::buildUnresolvedTargetMaterialization( Location loc, Value input, Type outputType, @@ -1556,7 +1554,7 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( ConversionPatternRewriter::ConversionPatternRewriter( MLIRContext *ctx, const 
ConversionConfig &config) : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this, config)) { + impl(new detail::ConversionPatternRewriterImpl(ctx, config)) { setListener(impl.get()); } From 404854ee2018489c15c3454857d92e3bab7c1672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 23 Feb 2024 11:19:30 +0100 Subject: [PATCH 335/351] [clang][Interp][NFC] Print global variable initialization state --- clang/lib/AST/Interp/Disasm.cpp | 8 ++++++++ clang/lib/AST/Interp/Program.cpp | 2 +- clang/lib/AST/Interp/Program.h | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index 3bc9312debeb7..315ddb293044b 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -101,6 +101,14 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const { for (const Global *G : Globals) { const Descriptor *Desc = G->block()->getDescriptor(); OS << GI << ": " << (void *)G->block() << " "; + { + Pointer GP = getPtrGlobal(GI); + ColorScope SC(OS, true, + GP.isInitialized() + ? TerminalColor{llvm::raw_ostream::GREEN, false} + : TerminalColor{llvm::raw_ostream::RED, false}); + OS << (GP.isInitialized() ? 
"initialized " : "uninitialized "); + } Desc->dump(OS); OS << "\n"; ++GI; diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp index 61293a3fef470..86e18ede63811 100644 --- a/clang/lib/AST/Interp/Program.cpp +++ b/clang/lib/AST/Interp/Program.cpp @@ -102,7 +102,7 @@ unsigned Program::createGlobalString(const StringLiteral *S) { return I; } -Pointer Program::getPtrGlobal(unsigned Idx) { +Pointer Program::getPtrGlobal(unsigned Idx) const { assert(Idx < Globals.size()); return Pointer(Globals[Idx]->block()); } diff --git a/clang/lib/AST/Interp/Program.h b/clang/lib/AST/Interp/Program.h index 7922eafbeb2d0..045bf7ab7745b 100644 --- a/clang/lib/AST/Interp/Program.h +++ b/clang/lib/AST/Interp/Program.h @@ -67,7 +67,7 @@ class Program final { unsigned createGlobalString(const StringLiteral *S); /// Returns a pointer to a global. - Pointer getPtrGlobal(unsigned Idx); + Pointer getPtrGlobal(unsigned Idx) const; /// Returns the value of a global. Block *getGlobal(unsigned Idx) { From e7c60915e61912fb24707dc67e6c4fc919515796 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 23 Feb 2024 12:01:30 +0100 Subject: [PATCH 336/351] Remove duplicated REQUIRES: asserts --- llvm/test/Transforms/LoopVectorize/X86/pr72969.ll | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll index 738f5cbaebea5..f982695983330 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -1,7 +1,6 @@ ; REQUIRES: asserts ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s -; REQUIRES: asserts @h = global i64 0 From 790bcecce6c135476d2551805c09ed670b9f8418 Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov 
Date: Fri, 23 Feb 2024 12:11:50 +0100 Subject: [PATCH 337/351] [GlobalISel] Fix a check that aligned tail call is lowered (#82016) Despite of a valid tail call opportunity, backends still may not generate a tail call or such lowering is not implemented yet. Check that lowering has happened instead of its possibility when generating G_ASSERT_ALIGN. --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 2 +- .../X86/GlobalISel/calllowering-tailcall.ll | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 3bd1542eeb746..77dc265d795d0 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -187,7 +187,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, if (!lowerCall(MIRBuilder, Info)) return false; - if (ReturnHintAlignReg && !Info.IsTailCall) { + if (ReturnHintAlignReg && !Info.LoweredTailCall) { MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg, ReturnHintAlign); } diff --git a/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll b/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll new file mode 100644 index 0000000000000..6a856c32eb261 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s | FileCheck %s --check-prefix=X64 +; RUN: llc -mtriple=i686-linux-gnu -global-isel < %s | FileCheck %s --check-prefix=X86 + +declare ptr @foo() + +define ptr @aligned_tailcall() nounwind { +; X64-LABEL: aligned_tailcall: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq foo +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: aligned_tailcall: +; X86: # 
%bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: calll foo +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %call = tail call align 8 ptr @foo() + ret ptr %call +} From 22734e15d8f2c437e8543f19632299d2e09b31f3 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 23 Feb 2024 11:31:24 +0000 Subject: [PATCH 338/351] [Clang][AArch64] Fix 'svzero_za' intrinsic to take no arguments. (#82648) We previously defined svzero_za as: void svzero_za(); rather than: void svzero_za(void); Which meant that Clang accepted arguments. Compiling for example `svzero_za()` ended up with incorrect IR and a compiler crash because it couldn't select an instruction for it. --- clang/include/clang/Basic/arm_sme.td | 2 +- clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 2da0e8d2aba9a..1ac6d5170ea28 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -142,7 +142,7 @@ let TargetGuard = "sme" in { def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], [ImmCheck<0, ImmCheck0_255>]>; - def SVZERO_ZA : SInst<"svzero_za", "v", "", MergeNone, "aarch64_sme_zero", + def SVZERO_ZA : SInst<"svzero_za", "vv", "", MergeNone, "aarch64_sme_zero", [IsOverloadNone, IsStreamingCompatible, IsOutZA]>; } diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c new file mode 100644 index 0000000000000..e0b6c391d9890 --- /dev/null +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fsyntax-only -verify %s + +void test_svzero_args(uint64_t m) { + 
svzero_za(0); // expected-error {{too many arguments to function call, expected 0, have 1}} + svzero_za(m); // expected-error {{too many arguments to function call, expected 0, have 1}} + svzero_mask_za(m); // expected-error {{argument to 'svzero_mask_za' must be a constant integer}} +} From 3c90fce4504e22953ec5586599afaecfb2923a9e Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 23 Feb 2024 11:31:53 +0000 Subject: [PATCH 339/351] [Clang][AArch64] Add missing prototypes for streaming-compatible routines (#82649) --- .../acle_sme_state_funs.c | 59 ++++++++++++++++++- clang/utils/TableGen/SveEmitter.cpp | 6 ++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c index dc07efbb81603..e80a965394e7f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c @@ -28,12 +28,12 @@ bool test_in_streaming_mode(void) __arm_streaming_compatible { // CHECK-LABEL: @test_za_disable( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]] +// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR3]] // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: @_Z15test_za_disablev( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]] +// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR3]] // CPP-CHECK-NEXT: ret void // void test_za_disable(void) __arm_streaming_compatible { @@ -70,3 +70,58 @@ void test_svundef_za(void) __arm_streaming_compatible __arm_out("za") { svundef_za(); } +// CHECK-LABEL: @test_sc_memcpy( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z14test_sc_memcpyPvPKvm( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +void *test_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible { + return __arm_sc_memcpy(dest, src, n); +} + +// CHECK-LABEL: @test_sc_memmove( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z15test_sc_memmovePvPKvm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +void *test_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible { + return __arm_sc_memmove(dest, src, n); +} + +// CHECK-LABEL: @test_sc_memset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z14test_sc_memsetPvim( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +void *test_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible { + return __arm_sc_memset(s, c, n); +} + +// CHECK-LABEL: @test_sc_memchr( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z14test_sc_memchrPvim( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef 
[[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +void *test_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible { + return __arm_sc_memchr(s, c, n); +} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 174304f09007b..131397e3825b0 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1579,6 +1579,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << "#endif\n"; OS << "#include \n\n"; + OS << "#include \n\n"; OS << "/* Function attributes */\n"; OS << "#define __ai static __inline__ __attribute__((__always_inline__, " @@ -1605,6 +1606,11 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << " return x0 & 1;\n"; OS << "}\n\n"; + OS << "void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n"; + OS << "void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n"; + OS << "void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;\n"; + OS << "void *__arm_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible;\n\n"; + OS << "__ai __attribute__((target(\"sme\"))) void svundef_za(void) " "__arm_streaming_compatible __arm_out(\"za\") " "{ }\n\n"; From 8a164220207b579c31d6aa6552944441c83e9465 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Fri, 23 Feb 2024 11:37:21 +0000 Subject: [PATCH 340/351] [RemoveDIs] Add DPLabels support [3a/3] (#82633) Patch 2 of 3 to add llvm.dbg.label support to the RemoveDIs project. The patch stack adds the DPLabel class, which is the RemoveDIs llvm.dbg.label equivalent. 1. Add DbgRecord base class for DPValue and the not-yet-added DPLabel class. 2. Add the DPLabel class. -> 3. Add support to passes. The next patch, #82639, will enable conversion between dbg.labels and DPLabels. AssignemntTrackingAnalysis support could have gone two ways: 1. 
Have the analysis store a DPLabel representation in its results - SelectionDAGBuilder reads the analysis results and ignores all DbgRecord kinds. 2. Ignore DPLabels in the analysis - SelectionDAGBuilder reads the analysis results but still needs to iterate over DPLabels from the IR. I went with option 2 because it's less work and is no less correct than 1. It's worth noting that causes labels to sink to the bottom of packs of debug records. e.g., [value, label, value] becomes [value, value, label]. This shouldn't be a problem because labels and variable locations don't have an ordering requirement. The ordering between variable locations is maintained and the label movement is deterministic --- .../include/llvm/IR/DebugProgramInstruction.h | 10 ++--- llvm/include/llvm/IR/IntrinsicInst.h | 3 ++ .../CodeGen/AssignmentTrackingAnalysis.cpp | 9 ++-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 12 ++++- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 17 ++++++- .../SelectionDAG/SelectionDAGBuilder.cpp | 29 ++++++++---- llvm/lib/IR/AsmWriter.cpp | 4 +- .../Scalar/SpeculativeExecution.cpp | 6 +-- llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 10 ++++- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 45 ++++++++++++------- .../Transforms/Utils/MemoryTaggingSupport.cpp | 3 +- llvm/lib/Transforms/Utils/ValueMapper.cpp | 5 +++ .../SpeculativeExecution/PR46267.ll | 5 +++ 13 files changed, 114 insertions(+), 44 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 1c8619741eb69..84b0f743d3c9b 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -157,6 +157,11 @@ class DbgRecord : public ilist_node { ~DbgRecord() = default; }; +inline raw_ostream &operator<<(raw_ostream &OS, const DbgRecord &R) { + R.print(OS); + return OS; +} + /// Records a position in IR for a source label (DILabel). Corresponds to the /// llvm.dbg.label intrinsic. 
/// FIXME: Rename DbgLabelRecord when DPValue is renamed to DbgVariableRecord. @@ -536,11 +541,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DPMarker &Marker) { return OS; } -inline raw_ostream &operator<<(raw_ostream &OS, const DPValue &Value) { - Value.print(OS); - return OS; -} - /// Inline helper to return a range of DPValues attached to a marker. It needs /// to be inlined as it's frequently called, but also come after the declaration /// of DPMarker. Thus: it's pre-declared by users like Instruction, then an diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index b8d578d0fee08..fbaaef8ea4431 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -531,6 +531,9 @@ class DbgAssignIntrinsic : public DbgValueInst { class DbgLabelInst : public DbgInfoIntrinsic { public: DILabel *getLabel() const { return cast(getRawLabel()); } + void setLabel(DILabel *NewLabel) { + setArgOperand(0, MetadataAsValue::get(getContext(), NewLabel)); + } Metadata *getRawLabel() const { return cast(getArgOperand(0))->getMetadata(); diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index 7b66a851db252..3b84624c3d4dc 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -829,11 +829,7 @@ class MemLocFragmentFill { void process(BasicBlock &BB, VarFragMap &LiveSet) { BBInsertBeforeMap[&BB].clear(); for (auto &I : BB) { - for (DbgRecord &DR : I.getDbgValueRange()) { - // FIXME: DPValue::filter usage needs attention in this file; we need - // to make sure dbg.labels are handled correctly in RemoveDIs mode. - // Cast below to ensure this gets fixed when DPLabels are introduced. 
- DPValue &DPV = cast(DR); + for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { if (const auto *Locs = FnVarLocs->getWedge(&DPV)) { for (const VarLocInfo &Loc : *Locs) { addDef(Loc, &DPV, *I.getParent(), LiveSet); @@ -1919,6 +1915,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) { // attached DPValues, or a non-debug instruction with attached unprocessed // DPValues. if (II != EI && II->hasDbgValues()) { + // Skip over non-variable debug records (i.e., labels). They're going to + // be read from IR (possibly re-ordering them within the debug record + // range) rather than from the analysis results. for (DPValue &DPV : DPValue::filter(II->getDbgValueRange())) { resetInsertionPoint(DPV); processDPValue(DPV, LiveSet); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 7c95cef2eeb76..38bb808dd5bd5 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3275,7 +3275,17 @@ void IRTranslator::translateDbgDeclareRecord(Value *Address, bool HasArgList, void IRTranslator::translateDbgInfo(const Instruction &Inst, MachineIRBuilder &MIRBuilder) { - for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) { + for (DbgRecord &DR : Inst.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast(&DR)) { + MIRBuilder.setDebugLoc(DPL->getDebugLoc()); + assert(DPL->getLabel() && "Missing label"); + assert(DPL->getLabel()->isValidLocationForIntrinsic( + MIRBuilder.getDebugLoc()) && + "Expected inlined-at fields to agree"); + MIRBuilder.buildDbgLabel(DPL->getLabel()); + continue; + } + DPValue &DPV = cast(DR); const DILocalVariable *Variable = DPV.getVariable(); const DIExpression *Expression = DPV.getExpression(); Value *V = DPV.getVariableLocationOp(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 5651498dd3f5a..246762dd7ab62 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1188,11 +1188,24 @@ void FastISel::handleDbgInfo(const Instruction *II) { MIMD = MIMetadata(); // Reverse order of debug records, because fast-isel walks through backwards. - for (DbgRecord &DPR : llvm::reverse(II->getDbgValueRange())) { + for (DbgRecord &DR : llvm::reverse(II->getDbgValueRange())) { flushLocalValueMap(); recomputeInsertPt(); - DPValue &DPV = cast(DPR); + if (DPLabel *DPL = dyn_cast(&DR)) { + assert(DPL->getLabel() && "Missing label"); + if (!FuncInfo.MF->getMMI().hasDebugInfo()) { + LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DPL << "\n"); + continue; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DPL->getDebugLoc(), + TII.get(TargetOpcode::DBG_LABEL)) + .addMetadata(DPL->getLabel()); + continue; + } + + DPValue &DPV = cast(DR); Value *V = nullptr; if (!DPV.hasArgList()) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e893a5b616d33..ee600d389c2cc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1241,17 +1241,30 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) { It->Expr, Vals.size() > 1, It->DL, SDNodeOrder); } } - // We must early-exit here to prevent any DPValues from being emitted below, - // as we have just emitted the debug values resulting from assignment - // tracking analysis, making any existing DPValues redundant (and probably - // less correct). - return; } + // We must skip DPValues if they've already been processed above as we + // have just emitted the debug values resulting from assignment tracking + // analysis, making any existing DPValues redundant (and probably less + // correct). We still need to process DPLabels. This does sink DPLabels + // to the bottom of the group of debug records. 
That shouldn't be important + as it does so deterministically and ordering between DPLabels and DPValues + is immaterial (other than for MIR/IR printing). + bool SkipDPValues = DAG.getFunctionVarLocs(); // Is there is any debug-info attached to this instruction, in the form of - // DPValue non-instruction debug-info records. - for (DbgRecord &DPR : I.getDbgValueRange()) { - DPValue &DPV = cast(DPR); + // DbgRecord non-instruction debug-info records. + for (DbgRecord &DR : I.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast(&DR)) { + assert(DPL->getLabel() && "Missing label"); + SDDbgLabel *SDV = + DAG.getDbgLabel(DPL->getLabel(), DPL->getDebugLoc(), SDNodeOrder); + DAG.AddDbgLabel(SDV); + continue; + } + + if (SkipDPValues) + continue; + DPValue &DPV = cast(DR); DILocalVariable *Variable = DPV.getVariable(); DIExpression *Expression = DPV.getExpression(); dropDanglingDebugInfo(Variable, Expression); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index c2a470c5fc716..fba404c9b027c 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1141,12 +1141,14 @@ void SlotTracker::processFunctionMetadata(const Function &F) { void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (const DPValue *DPV = dyn_cast(&DR)) { CreateMetadataSlot(DPV->getVariable()); - CreateMetadataSlot(DPV->getDebugLoc()); if (DPV->isDbgAssign()) CreateMetadataSlot(DPV->getAssignID()); + } else if (const DPLabel *DPL = dyn_cast(&DR)) { + CreateMetadataSlot(DPL->getLabel()); } else { llvm_unreachable("unsupported DbgRecord kind"); } + CreateMetadataSlot(DR.getDebugLoc()); } void SlotTracker::processInstructionMetadata(const Instruction &I) { diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index f4f3070d11c7b..260f31b59ed29 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -291,9 +291,9 @@ bool
SpeculativeExecutionPass::considerHoistingFromTo( InstructionCost TotalSpeculationCost = 0; unsigned NotHoistedInstCount = 0; for (const auto &I : FromBlock) { - // Make note of any DPValues that need hoisting. - for (DbgRecord &DR : I.getDbgValueRange()) { - DPValue &DPV = cast(DR); + // Make note of any DPValues that need hoisting. DPLabels + // get left behind just like llvm.dbg.labels. + for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { if (HasNoUnhoistedInstr(DPV.location_ops())) DPValuesToHoist[DPV.getInstruction()].push_back(&DPV); } diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 7fd6759a61fba..5bb109a04ff17 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -386,7 +386,15 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) { SmallVector ToBeRemoved; SmallDenseSet VariableSet; for (auto &I : reverse(*BB)) { - for (DPValue &DPV : reverse(DPValue::filter(I.getDbgValueRange()))) { + for (DbgRecord &DR : reverse(I.getDbgValueRange())) { + if (isa(DR)) { + // Emulate existing behaviour (see comment below for dbg.declares). + // FIXME: Don't do this. + VariableSet.clear(); + continue; + } + + DPValue &DPV = cast(DR); // Skip declare-type records, as the debug intrinsic method only works // on dbg.value intrinsics. 
if (DPV.getType() == DPValue::LocationType::Declare) { diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 8ebcf0c04fd5a..bab065153f3ef 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1585,8 +1585,30 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, return cast(NewVar); }; - auto UpdateDPValuesOnInst = [&](Instruction &I) -> void { - for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { + auto UpdateDbgLabel = [&](auto *LabelRecord) { + // Point the label record to a fresh label within the new function if + // the record was not inlined from some other function. + if (LabelRecord->getDebugLoc().getInlinedAt()) + return; + DILabel *OldLabel = LabelRecord->getLabel(); + DINode *&NewLabel = RemappedMetadata[OldLabel]; + if (!NewLabel) { + DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram( + *OldLabel->getScope(), *NewSP, Ctx, Cache); + NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(), + OldLabel->getFile(), OldLabel->getLine()); + } + LabelRecord->setLabel(cast(NewLabel)); + }; + + auto UpdateDbgRecordsOnInst = [&](Instruction &I) -> void { + for (DbgRecord &DR : I.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast(&DR)) { + UpdateDbgLabel(DPL); + continue; + } + + DPValue &DPV = cast(DR); // Apply the two updates that dbg.values get: invalid operands, and // variable metadata fixup. 
if (any_of(DPV.location_ops(), IsInvalidLocation)) { @@ -1599,13 +1621,11 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, } if (!DPV.getDebugLoc().getInlinedAt()) DPV.setVariable(GetUpdatedDIVariable(DPV.getVariable())); - DPV.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DPV.getDebugLoc(), - *NewSP, Ctx, Cache)); } }; for (Instruction &I : instructions(NewFunc)) { - UpdateDPValuesOnInst(I); + UpdateDbgRecordsOnInst(I); auto *DII = dyn_cast(&I); if (!DII) @@ -1614,17 +1634,7 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, // Point the intrinsic to a fresh label within the new function if the // intrinsic was not inlined from some other function. if (auto *DLI = dyn_cast(&I)) { - if (DLI->getDebugLoc().getInlinedAt()) - continue; - DILabel *OldLabel = DLI->getLabel(); - DINode *&NewLabel = RemappedMetadata[OldLabel]; - if (!NewLabel) { - DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram( - *OldLabel->getScope(), *NewSP, Ctx, Cache); - NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(), - OldLabel->getFile(), OldLabel->getLine()); - } - DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel)); + UpdateDbgLabel(DLI); continue; } @@ -1658,6 +1668,9 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, if (const DebugLoc &DL = I.getDebugLoc()) I.setDebugLoc( DebugLoc::replaceInlinedAtSubprogram(DL, *NewSP, Ctx, Cache)); + for (DbgRecord &DR : I.getDbgValueRange()) + DR.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DR.getDebugLoc(), + *NewSP, Ctx, Cache)); // Loop info metadata may contain line locations. Fix them up. 
auto updateLoopInfoLoc = [&Ctx, &Cache, NewSP](Metadata *MD) -> Metadata * { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 08fdd3b75ffcb..2ff7c01510767 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -111,8 +111,7 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) { void StackInfoBuilder::visit(Instruction &Inst) { // Visit non-intrinsic debug-info records attached to Inst. - for (DbgRecord &DR : Inst.getDbgValueRange()) { - DPValue &DPV = cast(DR); + for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) { auto AddIfInteresting = [&](Value *V) { if (auto *AI = dyn_cast_or_null(V)) { if (!isInterestingAlloca(*AI)) diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 6e46469f5a601..91ab2795a4b9d 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -538,6 +538,11 @@ Value *Mapper::mapValue(const Value *V) { } void Mapper::remapDPValue(DbgRecord &DR) { + if (DPLabel *DPL = dyn_cast(&DR)) { + DPL->setLabel(cast(mapMetadata(DPL->getLabel()))); + return; + } + DPValue &V = cast(DR); // Remap variables and DILocations. 
auto *MappedVar = mapMetadata(V.getVariable()); diff --git a/llvm/test/Transforms/SpeculativeExecution/PR46267.ll b/llvm/test/Transforms/SpeculativeExecution/PR46267.ll index c27b492b4b876..d940ee6a7863d 100644 --- a/llvm/test/Transforms/SpeculativeExecution/PR46267.ll +++ b/llvm/test/Transforms/SpeculativeExecution/PR46267.ll @@ -41,12 +41,16 @@ land.rhs: ; preds = %entry ; CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %y ; CHECK-NEXT: %a0 = load i32, ptr undef, align 1 ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %a0 +; CHECK-NEXT: call void @llvm.dbg.label call void @llvm.dbg.label(metadata !11), !dbg !10 %y = alloca i32, align 4 call void @llvm.dbg.declare(metadata ptr %y, metadata !14, metadata !DIExpression()), !dbg !10 %a0 = load i32, ptr undef, align 1 call void @llvm.dbg.value(metadata i32 %a0, metadata !9, metadata !DIExpression()), !dbg !10 + ;; RemoveDIs: Check a label that is attached to a hoisted instruction + ;; gets left behind (match intrinsic-style debug info behaviour). 
+ call void @llvm.dbg.label(metadata !15), !dbg !10 %a2 = add i32 %i, 0 call void @llvm.dbg.value(metadata i32 %a2, metadata !13, metadata !DIExpression()), !dbg !10 @@ -82,3 +86,4 @@ attributes #1 = { nounwind readnone speculatable willreturn } !12 = !DILocalVariable(name: "x", scope: !6, file: !1, line: 3, type: !4) !13 = !DILocalVariable(name: "a2", scope: !6, file: !1, line: 3, type: !4) !14 = !DILocalVariable(name: "y", scope: !6, file: !1, line: 3, type: !4) +!15 = !DILabel(scope: !6, name: "label2", file: !1, line: 2) From cdf19d13bf39f0679c3636eada87a5645f9a4c84 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 23 Feb 2024 11:43:28 +0000 Subject: [PATCH 341/351] [Clang] Fix acle_sme_zero.c (missing aarch64-registered-target) This test was added in #82648 --- clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c index e0b6c391d9890..8ea80bc6568fe 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,3 +1,4 @@ +// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fsyntax-only -verify %s void test_svzero_args(uint64_t m) { From e1326434742980b03433464dd9435ea66ad5be47 Mon Sep 17 00:00:00 2001 From: tsitdikov Date: Fri, 23 Feb 2024 11:47:40 +0000 Subject: [PATCH 342/351] Add build rule for MLIRArmSMETestPasses MLIRArmSMETestPasses was added in https://github.com/llvm/llvm-project/commit/b39f5660a408b47307e57a0882eb8af85d72e283, we need to add a build rule for it as well. 
--- .../llvm-project-overlay/mlir/test/BUILD.bazel | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index c3bc3f196c55d..497256573dfc5 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -905,6 +905,23 @@ cc_library( ], ) +cc_library( + name = "TestArmSME", + srcs = glob(["lib/Dialect/ArmSME/*.cpp"]), + defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], + includes = ["lib/Dialect/Test"], + deps = [ + "//mlir:ArithToArmSME", + "//mlir:ArmSMEToLLVM", + "//mlir:ArmSMEToSCF", + "//mlir:IR", + "//mlir:Pass", + "//mlir:Transforms", + "//mlir:VectorToArmSME", + "//mlir:VectorToSCF", + ], +) + cc_library( name = "TestBufferization", srcs = glob(["lib/Dialect/Bufferization/*.cpp"]), From 3dfca24dda1b3596685d02109185ea2885cc0124 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 23 Feb 2024 03:50:00 -0800 Subject: [PATCH 343/351] [AMDGPU] Fix encoding of VOP3P dpp on GFX11 and GFX12 (#82710) The bug affects dpp forms of v_dot2_f32_f16. The encoding does not match SP3 and does not set op_sel_hi bits properly. 
--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 ++ llvm/lib/Target/AMDGPU/VOPInstructions.td | 1 + llvm/test/MC/AMDGPU/gfx11-promotions.s | 8 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s | 4 ++-- llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s | 2 +- llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s | 4 ++-- llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s | 2 +- .../MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt | 8 ++++---- .../test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt | 4 ++-- .../MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt | 8 ++++---- .../test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt | 4 ++-- 11 files changed, 25 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index cf76de40aef41..ac3c8f95306bc 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1353,6 +1353,7 @@ class VOP3P_DPP16 op, VOP_DPP_Pseudo ps, int subtarget, let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; let OtherPredicates = ps.OtherPredicates; + let IsPacked = ps.IsPacked; } class VOP3P_DPP8_Base op, VOP_Pseudo ps, string opName = ps.OpName> @@ -1362,6 +1363,7 @@ class VOP3P_DPP8_Base op, VOP_Pseudo ps, string opName = ps.OpName> let SchedRW = ps.SchedRW; let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; + let IsPacked = ps.IsPacked; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 2989d05e968ef..80d7d96a5e3cc 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -818,6 +818,7 @@ class VOP_DPP_Pseudo pattern=[], let VALU = 1; let DPP = 1; let Size = 8; + let IsPacked = P.IsPacked; let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); diff --git a/llvm/test/MC/AMDGPU/gfx11-promotions.s 
b/llvm/test/MC/AMDGPU/gfx11-promotions.s index 0bd90266457ee..67e7beaa262f4 100644 --- a/llvm/test/MC/AMDGPU/gfx11-promotions.s +++ b/llvm/test/MC/AMDGPU/gfx11-promotions.s @@ -337,17 +337,17 @@ v_dot2_f32_f16_e64 v0, v1, v2, v3 //===----------------------------------------------------------------------===// v_dot2_f32_f16 v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x40,0x13,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x77,0x39,0x05] v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x40,0x13,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x77,0x39,0x05] //===----------------------------------------------------------------------===// // VOP3P.DPP16. 
//===----------------------------------------------------------------------===// v_dot2_f32_f16 v0, v1, v2, v3 quad_perm:[1,2,3,0] -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff] v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s index 2cfb8abd4e979..3ff4ed27f1b25 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s @@ -2,10 +2,10 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1 -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x04,0xff] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff] v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: 
[0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0 // GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x01,0xf1] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s index 2656ba0cf1807..3fb993dc8bec4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s @@ -15,4 +15,4 @@ v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8: // GFX11: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] +// GFX11: encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s index 75bd1696e10bb..a6360684f1d0e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s @@ -2,10 +2,10 @@ // RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1 -// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x04,0xff] +// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: 
[0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff] v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe -// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] +// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0 // GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x01,0xf1] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s index 14cf169d4b424..299339339e8c6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s @@ -15,7 +15,7 @@ v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8: // GFX12: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] +// GFX12: encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] v_dot4_f32_fp8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] // GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x24,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt index 6b230367c8313..ceca6d9fc3faa 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt @@ -1,11 
+1,11 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s -# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] -0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] +0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe -# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff] -0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff] +0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff # GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1] 0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt index 89c9b54d7cfee..57c96170eadce 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt @@ -1,8 +1,8 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 
-mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s -# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] -0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05 +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] +0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05 # GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] 0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt index 52fd0530681cf..10f438465d65e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt @@ -1,11 +1,11 @@ # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] -0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe +# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] +0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe -# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff] -0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff +# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff] +0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff # GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1] 0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt index 688212e51c427..2fb9c23ed5ec5 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt @@ -1,8 +1,8 @@ # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] -0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05 +# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] +0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05 # GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] 0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92 From d9e4309b451c1b24d4e0a6304057663b877e5266 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Markus=20B=C3=B6ck?= Date: Fri, 23 Feb 2024 12:50:20 +0100 Subject: [PATCH 344/351] [mlir][NFC] Fix format specifier warning on Windows `%ld` specifier is defined to work on values of type `long`. The parameter given to `fprintf` is of type `intptr_t` whose actual underlying integer type is unspecified. On Unix systems it happens to commonly be `long` but on 64-bit Windows it is defined as `long long`. The cross-platform way to print an `intptr_t` is to use `PRIdPTR` which expands to the correct format specifier for `intptr_t`. This avoids any undefined behaviour and compiler warnings. --- mlir/test/CAPI/llvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/test/CAPI/llvm.c b/mlir/test/CAPI/llvm.c index 5a78fac91a509..1817988dd67dd 100644 --- a/mlir/test/CAPI/llvm.c +++ b/mlir/test/CAPI/llvm.c @@ -15,6 +15,7 @@ #include "mlir-c/Support.h" #include +#include #include #include #include @@ -105,7 +106,7 @@ static int testStructTypeCreation(MlirContext ctx) { // CHECK: i8 // CHECK: i32 // CHECK: i64 - fprintf(stderr, "num elements: %ld\n", + fprintf(stderr, "num elements: %" PRIdPTR "\n", mlirLLVMStructTypeGetNumElementTypes(literal)); for (intptr_t i = 0; i < 3; ++i) { mlirTypeDump(mlirLLVMStructTypeGetElementType(literal, i)); From 6ac2c0488f0e06036fc2bd7a94bea71fb930b363 Mon Sep 17 00:00:00 2001 From: tsitdikov Date: Fri, 23 Feb 2024 11:57:14 +0000 Subject: [PATCH 345/351] Add TestArmSME dependency to mlir-opt library. TestArmSME was added in https://github.com/llvm/llvm-project/commit/e1326434742980b03433464dd9435ea66ad5be47, now we need to add a dependency on it.
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index a34874efa5b19..853d136d9478f 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -9170,6 +9170,7 @@ cc_binary( "//mlir/test:TestAffine", "//mlir/test:TestAnalysis", "//mlir/test:TestArith", + "//mlir/test:TestArmSME", "//mlir/test:TestBufferization", "//mlir/test:TestControlFlow", "//mlir/test:TestDLTI", From f1e0392b822e06f39c49df3ba594f4c98f608ba0 Mon Sep 17 00:00:00 2001 From: Garvit Gupta <152526799+quic-garvgupt@users.noreply.github.com> Date: Fri, 23 Feb 2024 17:31:58 +0530 Subject: [PATCH 346/351] [RISCV] Disable generation of asynchronous unwind tables for RISCV baremetal (#81727) The below culprit patch enabled the generation of asynchronous unwind tables (-funwind-tables=2) by default for RISCV for both linux and RISCVToolChain baremetal object. However, since there are 2 baremetal toolchain objects for RISCV, this created a discrepancy between their behavior. Moreover, enabling the generation of asynchronous unwind tables based on whether `-gcc-toolchain` option is present or not doesn't seem to be the best criterion to decide on the same. This patch makes the behavior consistent by disabling the unwind tables in RISCVToolChain Baremetal object.
Culprit Patch - https://reviews.llvm.org/D145164 --- clang/lib/Driver/ToolChains/RISCVToolchain.cpp | 5 +++++ clang/lib/Driver/ToolChains/RISCVToolchain.h | 2 ++ clang/test/Driver/riscv-features.c | 8 ++++++++ 3 files changed, 15 insertions(+) diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index 85beb945cbf6f..624099d21ae12 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -86,6 +86,11 @@ RISCVToolChain::GetUnwindLibType(const llvm::opt::ArgList &Args) const { return ToolChain::UNW_None; } +ToolChain::UnwindTableLevel RISCVToolChain::getDefaultUnwindTableLevel( + const llvm::opt::ArgList &Args) const { + return UnwindTableLevel::None; +} + void RISCVToolChain::addClangTargetOptions( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.h b/clang/lib/Driver/ToolChains/RISCVToolchain.h index cec817ef7190b..fa0aa265d842b 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.h +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.h @@ -28,6 +28,8 @@ class LLVM_LIBRARY_VISIBILITY RISCVToolChain : public Generic_ELF { RuntimeLibType GetDefaultRuntimeLibType() const override; UnwindLibType GetUnwindLibType(const llvm::opt::ArgList &Args) const override; + UnwindTableLevel + getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override; void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index a108383e29fb6..fc5fb0f27e3af 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -41,6 +41,14 @@ // FAST-UNALIGNED-ACCESS: "-target-feature" "+fast-unaligned-access" // NO-FAST-UNALIGNED-ACCESS: "-target-feature" "-fast-unaligned-access" +// RUN: %clang --target=riscv32-unknown-elf 
--gcc-toolchain="" -### %s 2>&1 | FileCheck %s -check-prefix=NOUWTABLE +// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain="" -fasynchronous-unwind-tables -### %s 2>&1 | FileCheck %s -check-prefix=UWTABLE +// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain="" -### %s 2>&1 | FileCheck %s -check-prefix=NOUWTABLE +// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain="" -fasynchronous-unwind-tables -### %s 2>&1 | FileCheck %s -check-prefix=UWTABLE +// +// UWTABLE: "-funwind-tables=2" +// NOUWTABLE-NOT: "-funwind-tables=2" + // RUN: %clang --target=riscv32-linux -### %s -fsyntax-only 2>&1 \ // RUN: | FileCheck %s -check-prefix=DEFAULT-LINUX // RUN: %clang --target=riscv64-linux -### %s -fsyntax-only 2>&1 \ From 3b3d0978c334702114131e4dab549aa25b9f0ad4 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 23 Feb 2024 12:12:50 +0000 Subject: [PATCH 347/351] [Clang] Fix acle_sme_zero.c once more. --- clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c index 8ea80bc6568fe..a852ffa09c60e 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,6 +1,8 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fsyntax-only -verify %s +#include + void test_svzero_args(uint64_t m) { svzero_za(0); // expected-error {{too many arguments to function call, expected 0, have 1}} svzero_za(m); // expected-error {{too many arguments to function call, expected 0, have 1}} From bcf9826a5392f40063869c3d2b72a5cd1b87d14b Mon Sep 17 00:00:00 2001 From: Johannes Reifferscheid Date: Fri, 23 Feb 2024 13:15:08 +0100 Subject: [PATCH 348/351] [MLIR] Expose approximation patterns for tanh/erf. 
(#82750) These patterns can already be used via populateMathPolynomialApproximationPatterns, but that includes a number of other patterns that may not be needed. There are already similar functions for expansion. For now only adding tanh and erf since I have a concrete use case for these two. --- mlir/include/mlir/Dialect/Math/Transforms/Passes.h | 3 +++ .../Math/Transforms/PolynomialApproximation.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/mlir/include/mlir/Dialect/Math/Transforms/Passes.h b/mlir/include/mlir/Dialect/Math/Transforms/Passes.h index 010dde5ea7384..11b2c7a7afa2f 100644 --- a/mlir/include/mlir/Dialect/Math/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Math/Transforms/Passes.h @@ -45,6 +45,9 @@ struct MathPolynomialApproximationOptions { bool enableAvx2 = false; }; +void populatePolynomialApproximateTanhPattern(RewritePatternSet &patterns); +void populatePolynomialApproximateErfPattern(RewritePatternSet &patterns); + void populateMathPolynomialApproximationPatterns( RewritePatternSet &patterns, const MathPolynomialApproximationOptions &options = {}); diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp index 71e4e13103f51..962cb28b7c2ab 100644 --- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp +++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp @@ -1471,6 +1471,16 @@ RsqrtApproximation::matchAndRewrite(math::RsqrtOp op, //----------------------------------------------------------------------------// +void mlir::populatePolynomialApproximateTanhPattern( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +void mlir::populatePolynomialApproximateErfPattern( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + void mlir::populateMathPolynomialApproximationPatterns( RewritePatternSet &patterns, const MathPolynomialApproximationOptions &options) { From 
ddb4450a468072b5c066c29f4821edec4689d500 Mon Sep 17 00:00:00 2001 From: r4nt Date: Fri, 23 Feb 2024 13:18:00 +0100 Subject: [PATCH 349/351] [ClangFormat] Fix indent in child lines within a macro argument. (#82523) When reconstructing lines from a macro expansion, make sure that lines at different levels in the expanded code get indented correctly as part of the macro argument. --- clang/lib/Format/MacroCallReconstructor.cpp | 68 +++++---- clang/lib/Format/Macros.h | 10 +- clang/lib/Format/UnwrappedLineParser.cpp | 6 + clang/lib/Format/UnwrappedLineParser.h | 2 + .../Format/FormatTestMacroExpansion.cpp | 21 ++- .../Format/MacroCallReconstructorTest.cpp | 129 ++++++++++++------ 6 files changed, 163 insertions(+), 73 deletions(-) diff --git a/clang/lib/Format/MacroCallReconstructor.cpp b/clang/lib/Format/MacroCallReconstructor.cpp index cbdd1683c54d1..101acefdfe7a3 100644 --- a/clang/lib/Format/MacroCallReconstructor.cpp +++ b/clang/lib/Format/MacroCallReconstructor.cpp @@ -33,7 +33,7 @@ void forEachToken(const UnwrappedLine &Line, const T &Call, FormatToken *Parent = nullptr) { bool First = true; for (const auto &N : Line.Tokens) { - Call(N.Tok, Parent, First); + Call(N.Tok, Parent, First, Line.Level); First = false; for (const auto &Child : N.Children) forEachToken(Child, Call, N.Tok); @@ -44,7 +44,7 @@ MacroCallReconstructor::MacroCallReconstructor( unsigned Level, const llvm::DenseMap> &ActiveExpansions) - : Level(Level), IdToReconstructed(ActiveExpansions) { + : Result(Level), IdToReconstructed(ActiveExpansions) { Result.Tokens.push_back(std::make_unique()); ActiveReconstructedLines.push_back(&Result); } @@ -52,9 +52,8 @@ MacroCallReconstructor::MacroCallReconstructor( void MacroCallReconstructor::addLine(const UnwrappedLine &Line) { assert(State != Finalized); LLVM_DEBUG(llvm::dbgs() << "MCR: new line...\n"); - forEachToken(Line, [&](FormatToken *Token, FormatToken *Parent, bool First) { - add(Token, Parent, First); - }); + forEachToken(Line, [&](FormatToken 
*Token, FormatToken *Parent, bool First, + unsigned Level) { add(Token, Parent, First, Level); }); assert(InProgress || finished()); } @@ -62,8 +61,8 @@ UnwrappedLine MacroCallReconstructor::takeResult() && { finalize(); assert(Result.Tokens.size() == 1 && Result.Tokens.front()->Children.size() == 1); - UnwrappedLine Final = - createUnwrappedLine(*Result.Tokens.front()->Children.front(), Level); + UnwrappedLine Final = createUnwrappedLine( + *Result.Tokens.front()->Children.front(), Result.Level); assert(!Final.Tokens.empty()); return Final; } @@ -72,7 +71,8 @@ UnwrappedLine MacroCallReconstructor::takeResult() && { // ExpandedParent in the incoming unwrapped line. \p First specifies whether it // is the first token in a given unwrapped line. void MacroCallReconstructor::add(FormatToken *Token, - FormatToken *ExpandedParent, bool First) { + FormatToken *ExpandedParent, bool First, + unsigned Level) { LLVM_DEBUG( llvm::dbgs() << "MCR: Token: " << Token->TokenText << ", Parent: " << (ExpandedParent ? ExpandedParent->TokenText : "") @@ -102,7 +102,7 @@ void MacroCallReconstructor::add(FormatToken *Token, First = true; } - prepareParent(ExpandedParent, First); + prepareParent(ExpandedParent, First, Level); if (Token->MacroCtx) { // If this token was generated by a macro call, add the reconstructed @@ -129,7 +129,7 @@ void MacroCallReconstructor::add(FormatToken *Token, // is the parent of ActiveReconstructedLines.back() in the reconstructed // unwrapped line. 
void MacroCallReconstructor::prepareParent(FormatToken *ExpandedParent, - bool NewLine) { + bool NewLine, unsigned Level) { LLVM_DEBUG({ llvm::dbgs() << "ParentMap:\n"; debugParentMap(); @@ -172,7 +172,7 @@ void MacroCallReconstructor::prepareParent(FormatToken *ExpandedParent, } assert(!ActiveReconstructedLines.empty()); ActiveReconstructedLines.back()->Tokens.back()->Children.push_back( - std::make_unique()); + std::make_unique(Level)); ActiveReconstructedLines.push_back( &*ActiveReconstructedLines.back()->Tokens.back()->Children.back()); } else if (parentLine().Tokens.back()->Tok != Parent) { @@ -424,7 +424,8 @@ bool MacroCallReconstructor::processNextReconstructed() { SpelledParentToReconstructedParent[MacroCallStructure.back() .ParentLastToken] = Token; appendToken(Token); - prepareParent(Token, /*NewLine=*/true); + prepareParent(Token, /*NewLine=*/true, + MacroCallStructure.back().Line->Level); Token->MacroParent = true; return false; } @@ -435,7 +436,8 @@ bool MacroCallReconstructor::processNextReconstructed() { [MacroCallStructure.back().Line->Tokens.back()->Tok] = Token; Token->MacroParent = true; appendToken(Token, MacroCallStructure.back().Line); - prepareParent(Token, /*NewLine=*/true); + prepareParent(Token, /*NewLine=*/true, + MacroCallStructure.back().Line->Level); return true; } if (Token->is(tok::r_paren)) { @@ -509,16 +511,36 @@ MacroCallReconstructor::createUnwrappedLine(const ReconstructedLine &Line, for (const auto &N : Line.Tokens) { Result.Tokens.push_back(N->Tok); UnwrappedLineNode &Current = Result.Tokens.back(); - for (const auto &Child : N->Children) { - if (Child->Tokens.empty()) - continue; - Current.Children.push_back(createUnwrappedLine(*Child, Level + 1)); - } - if (Current.Children.size() == 1 && - Current.Tok->isOneOf(tok::l_paren, tok::comma)) { - Result.Tokens.splice(Result.Tokens.end(), - Current.Children.front().Tokens); - Current.Children.clear(); + auto NumChildren = + std::count_if(N->Children.begin(), N->Children.end(), + 
[](const auto &Child) { return !Child->Tokens.empty(); }); + if (NumChildren == 1 && Current.Tok->isOneOf(tok::l_paren, tok::comma)) { + // If we only have one child, and the child is due to a macro expansion + // (either attached to a left parenthesis or comma), merge the child into + // the current line to prevent forced breaks for macro arguments. + auto *Child = std::find_if( + N->Children.begin(), N->Children.end(), + [](const auto &Child) { return !Child->Tokens.empty(); }); + auto Line = createUnwrappedLine(**Child, Level); + Result.Tokens.splice(Result.Tokens.end(), Line.Tokens); + } else if (NumChildren > 0) { + // When there are multiple children with different indent, make sure that + // we indent them: + // 1. One level below the current line's level. + // 2. At the correct level relative to each other. + unsigned MinChildLevel = + std::min_element(N->Children.begin(), N->Children.end(), + [](const auto &E1, const auto &E2) { + return E1->Level < E2->Level; + }) + ->get() + ->Level; + for (const auto &Child : N->Children) { + if (Child->Tokens.empty()) + continue; + Current.Children.push_back(createUnwrappedLine( + *Child, Level + 1 + (Child->Level - MinChildLevel))); + } } } return Result; diff --git a/clang/lib/Format/Macros.h b/clang/lib/Format/Macros.h index 1964624e828ce..d2f7fe502364c 100644 --- a/clang/lib/Format/Macros.h +++ b/clang/lib/Format/Macros.h @@ -231,8 +231,9 @@ class MacroCallReconstructor { UnwrappedLine takeResult() &&; private: - void add(FormatToken *Token, FormatToken *ExpandedParent, bool First); - void prepareParent(FormatToken *ExpandedParent, bool First); + void add(FormatToken *Token, FormatToken *ExpandedParent, bool First, + unsigned Level); + void prepareParent(FormatToken *ExpandedParent, bool First, unsigned Level); FormatToken *getParentInResult(FormatToken *Parent); void reconstruct(FormatToken *Token); void startReconstruction(FormatToken *Token); @@ -272,6 +273,8 @@ class MacroCallReconstructor { // FIXME: 
Investigate changing UnwrappedLine to a pointer type and using it // instead of rolling our own type. struct ReconstructedLine { + explicit ReconstructedLine(unsigned Level) : Level(Level) {} + unsigned Level; llvm::SmallVector> Tokens; }; @@ -373,9 +376,6 @@ class MacroCallReconstructor { // \- ) llvm::SmallVector MacroCallStructure; - // Level the generated UnwrappedLine will be at. - const unsigned Level; - // Maps from identifier of the macro call to an unwrapped line containing // all tokens of the macro call. const llvm::DenseMap> diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 8f6453a25d9d4..3a424bdcde793 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -90,6 +90,12 @@ class ScopedDeclarationState { } // end anonymous namespace +std::ostream &operator<<(std::ostream &Stream, const UnwrappedLine &Line) { + llvm::raw_os_ostream OS(Stream); + printLine(OS, Line); + return Stream; +} + class ScopedLineState { public: ScopedLineState(UnwrappedLineParser &Parser, diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index 739298690bbd7..1403533a2d0ef 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -420,6 +420,8 @@ struct UnwrappedLineNode { SmallVector Children; }; +std::ostream &operator<<(std::ostream &Stream, const UnwrappedLine &Line); + } // end namespace format } // end namespace clang diff --git a/clang/unittests/Format/FormatTestMacroExpansion.cpp b/clang/unittests/Format/FormatTestMacroExpansion.cpp index 653ec2a94c64d..85ab6ea3794e8 100644 --- a/clang/unittests/Format/FormatTestMacroExpansion.cpp +++ b/clang/unittests/Format/FormatTestMacroExpansion.cpp @@ -48,7 +48,7 @@ TEST_F(FormatTestMacroExpansion, UnexpandConfiguredMacros) { )", Style); verifyIncompleteFormat("ID3({, ID(a *b),\n" - " ;\n" + " ;\n" " });", Style); @@ -131,9 +131,9 @@ ID(CALL(CALL(a * 
b))); EXPECT_EQ(R"( ID3( { - CLASS - a *b; - }; + CLASS + a *b; + }; }, ID(x *y); , @@ -287,6 +287,19 @@ TEST_F(FormatTestMacroExpansion, Style); } +TEST_F(FormatTestMacroExpansion, IndentChildrenWithinMacroCall) { + FormatStyle Style = getGoogleStyleWithColumns(22); + Style.Macros.push_back("MACRO(a, b)=a=(b)"); + verifyFormat("void f() {\n" + " MACRO(a b, call([] {\n" + " if (expr) {\n" + " indent();\n" + " }\n" + " }));\n" + "}", + Style); +} + } // namespace } // namespace test } // namespace format diff --git a/clang/unittests/Format/MacroCallReconstructorTest.cpp b/clang/unittests/Format/MacroCallReconstructorTest.cpp index 6e6900577d165..9df21eae70cb7 100644 --- a/clang/unittests/Format/MacroCallReconstructorTest.cpp +++ b/clang/unittests/Format/MacroCallReconstructorTest.cpp @@ -151,17 +151,21 @@ class MacroCallReconstructorTest : public ::testing::Test { Lex.Allocator, Lex.IdentTable); } - UnwrappedLine line(llvm::ArrayRef Tokens) { + UnwrappedLine line(llvm::ArrayRef Tokens, unsigned Level = 0) { UnwrappedLine Result; + Result.Level = Level; for (FormatToken *Tok : Tokens) Result.Tokens.push_back(UnwrappedLineNode(Tok)); return Result; } - UnwrappedLine line(llvm::StringRef Text) { return line({lex(Text)}); } + UnwrappedLine line(llvm::StringRef Text, unsigned Level = 0) { + return line({lex(Text)}, Level); + } - UnwrappedLine line(llvm::ArrayRef Chunks) { + UnwrappedLine line(llvm::ArrayRef Chunks, unsigned Level = 0) { UnwrappedLine Result; + Result.Level = Level; for (const Chunk &Chunk : Chunks) { Result.Tokens.insert(Result.Tokens.end(), Chunk.Tokens.begin(), Chunk.Tokens.end()); @@ -186,6 +190,8 @@ class MacroCallReconstructorTest : public ::testing::Test { }; bool matchesTokens(const UnwrappedLine &L1, const UnwrappedLine &L2) { + if (L1.Level != L2.Level) + return false; if (L1.Tokens.size() != L2.Tokens.size()) return false; for (auto L1It = L1.Tokens.begin(), L2It = L2.Tokens.begin(); @@ -288,7 +294,8 @@ TEST_F(MacroCallReconstructorTest, 
StatementSequence) { matchesLine(line( {U1.consume("SEMI"), children({line({U2.consume("SEMI"), - children({line(U3.consume("SEMI"))})})})}))); + children({line(U3.consume("SEMI"), 2)})}, + 1)})}))); } TEST_F(MacroCallReconstructorTest, NestedBlock) { @@ -337,9 +344,9 @@ TEST_F(MacroCallReconstructorTest, NestedBlock) { auto Expected = line({Chunk2Start, children({ - line(Chunk2LBrace), - line({Chunk1, Chunk2Mid}), - line(Chunk2RBrace), + line(Chunk2LBrace, 1), + line({Chunk1, Chunk2Mid}, 1), + line(Chunk2RBrace, 1), }), Chunk2End}); EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); @@ -379,9 +386,11 @@ TEST_F(MacroCallReconstructorTest, NestedChildBlocks) { Unexp.addLine( line({E.consume("f([] {"), children({line({E.consume("f([] {"), - children({line(E.consume("return a * b;"))}), - E.consume("})")})}), - E.consume("})")})); + children({line(E.consume("return a * b;"), 3)}), + E.consume("})")}, + 2)}), + E.consume("})")}, + 1)); Unexp.addLine(line(E.consume("}"))); EXPECT_TRUE(Unexp.finished()); @@ -407,13 +416,15 @@ TEST_F(MacroCallReconstructorTest, NestedChildBlocks) { auto Expected = line({ Chunk3Start, children({ - line(Chunk3LBrace), - line({ - Chunk2Start, - Chunk1, - Chunk2End, - }), - line(Chunk3RBrace), + line(Chunk3LBrace, 1), + line( + { + Chunk2Start, + Chunk1, + Chunk2End, + }, + 2), + line(Chunk3RBrace, 1), }), Chunk3End, }); @@ -469,8 +480,8 @@ TEST_F(MacroCallReconstructorTest, MultipleToplevelUnwrappedLines) { auto Expected = line({ U.consume("ID("), children({ - line(U.consume("x;")), - line(U.consume("x")), + line(U.consume("x;"), 1), + line(U.consume("x"), 1), }), U.consume(", y)"), }); @@ -524,9 +535,9 @@ TEST_F(MacroCallReconstructorTest, NestedCallsMultipleLines) { auto Expected = line({ Chunk2Start, children({ - line({Chunk2LBrace}), - line({Chunk1, Chunk2Semi}), - line({Chunk2RBrace}), + line({Chunk2LBrace}, 1), + line({Chunk1, Chunk2Semi}, 1), + line({Chunk2RBrace}, 1), }), Chunk2End, }); @@ -556,15 +567,17 @@ 
TEST_F(MacroCallReconstructorTest, ParentOutsideMacroCall) { auto Expected = line({ Prefix, children({ - line({ - U.consume("ID("), - children({ - line(U.consume("x;")), - line(U.consume("y;")), - line(U.consume("z;")), - }), - U.consume(")"), - }), + line( + { + U.consume("ID("), + children({ + line(U.consume("x;"), 2), + line(U.consume("y;"), 2), + line(U.consume("z;"), 2), + }), + U.consume(")"), + }, + 1), }), Postfix, }); @@ -590,7 +603,7 @@ TEST_F(MacroCallReconstructorTest, ChildrenSplitAcrossArguments) { Matcher U(Call, Lex); auto Expected = line({ U.consume("CALL({"), - children(line(U.consume("a;"))), + children(line(U.consume("a;"), 1)), U.consume(", b; })"), }); EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); @@ -620,16 +633,20 @@ TEST_F(MacroCallReconstructorTest, ChildrenAfterMacroCall) { Matcher U(Call, Lex); auto Expected = line({ U.consume("CALL({"), - children(line(U.consume("a"))), + children(line(U.consume("a"), 1)), U.consume(", b)"), Semi, - children(line({ - SecondLine, - children(line({ - ThirdLine, - Postfix, - })), - })), + children(line( + { + SecondLine, + children(line( + { + ThirdLine, + Postfix, + }, + 2)), + }, + 1)), }); EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); } @@ -655,7 +672,37 @@ TEST_F(MacroCallReconstructorTest, InvalidCodeSplittingBracesAcrossArgs) { Matcher U(Call, Lex); auto Expected = line({ Prefix, - children({line(U.consume("M({,x,)"))}), + children({line(U.consume("M({,x,)"), 1)}), + }); + EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); +} + +TEST_F(MacroCallReconstructorTest, IndentLevelInExpandedCode) { + auto Macros = createExpander({"ID(a)=a"}); + Expansion Exp(Lex, *Macros); + TokenList Call = Exp.expand("ID", {std::string("[] { { x; } }")}); + + MacroCallReconstructor Unexp(0, Exp.getUnexpanded()); + Matcher E(Exp.getTokens(), Lex); + Unexp.addLine(line({ + E.consume("[] {"), + children({ + line(E.consume("{"), 1), + line(E.consume("x;"), 2), + 
line(E.consume("}"), 1), + }), + E.consume("}"), + })); + EXPECT_TRUE(Unexp.finished()); + Matcher U(Call, Lex); + auto Expected = line({ + U.consume("ID([] {"), + children({ + line(U.consume("{"), 1), + line(U.consume("x;"), 2), + line(U.consume("}"), 1), + }), + U.consume("})"), }); EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); } From b639dd319ef9245dc28883f431a3ffafc01c57dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Tue, 20 Feb 2024 15:34:11 +0100 Subject: [PATCH 350/351] [GlobalIsel] Combine ADDE Clang has them as builtins (__builtin_addc). The middle end has no intrinsics for them. They are used in legalization operations. AArch64: ADCS Add with carry and set flags On Neoverse V2, they run at half the throughput of basic arithmetic and have a limited set of pipelines. --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 10 +- .../CodeGen/GlobalISel/GenericMachineInstrs.h | 17 + .../include/llvm/Target/GlobalISel/Combine.td | 8 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 209 ++ .../AArch64/GlobalISel/combine-adde.mir | 300 +++ llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll | 48 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 1745 ++++++++++------- .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 975 ++++++--- 8 files changed, 2335 insertions(+), 977 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 23728636498ba..abc2ebdfd878c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -810,12 +810,15 @@ class CombinerHelper { /// Combine selects. bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo); - /// Combine ands, + /// Combine ands. bool matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo); - /// Combine ors, + /// Combine ors. 
bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Combine addes. + bool matchAddCarryInOut(MachineInstr &MI, BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; @@ -919,6 +922,7 @@ class CombinerHelper { bool isZeroOrZeroSplat(Register Src, bool AllowUndefs); bool isConstantSplatVector(Register Src, int64_t SplatValue, bool AllowUndefs); + bool isConstantOrConstantVectorI(Register Src); std::optional getConstantOrConstantSplatVector(Register Src); @@ -930,6 +934,8 @@ class CombinerHelper { // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y. bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo); + + bool isZExtOrTruncLegal(LLT ToTy, LLT FromTy) const; }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index f5a6528d10a97..e46d2d1aac0e8 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -359,6 +359,8 @@ class GBinOpCarryOut : public GenericMachineInstr { Register getCarryOutReg() const { return getReg(1); } MachineOperand &getLHS() { return getOperand(2); } MachineOperand &getRHS() { return getOperand(3); } + Register getLHSReg() { return getOperand(2).getReg(); } + Register getRHSReg() { return getOperand(3).getReg(); } static bool classof(const MachineInstr *MI) { switch (MI->getOpcode()) { @@ -448,6 +450,21 @@ class GAddSubCarryInOut : public GAddSubCarryOut { } }; +/// Represents overflowing add operations that also consume a carry-in. 
+/// G_UADDE, G_SADDE +class GAddCarryInOut : public GAddSubCarryInOut { +public: + static bool classof(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case TargetOpcode::G_UADDE: + case TargetOpcode::G_SADDE: + return true; + default: + return false; + } + } +}; + /// Represents a call to an intrinsic. class GIntrinsic final : public GenericMachineInstr { public: diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 17757ca3e4111..22a2aa1c78be5 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1260,6 +1260,12 @@ def match_ors : GICombineRule< [{ return Helper.matchOr(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; +def match_addes : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SADDE, G_UADDE):$root, + [{ return Helper.matchAddCarryInOut(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + // Combines concat operations def concat_matchinfo : GIDefMatchData<"SmallVector">; def combine_concat_vector : GICombineRule< @@ -1343,7 +1349,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, and_or_disjoint_mask, fma_combines, fold_binop_into_select, sub_add_reg, select_to_minmax, redundant_binop_in_equality, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, - combine_concat_vector]>; + combine_concat_vector, match_addes]>; // A combine group used to for prelegalizer combiners at -O0. 
The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e8a5c6fedc395..96cc6e8c06c1d 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6354,6 +6354,23 @@ CombinerHelper::getConstantOrConstantSplatVector(Register Src) { return Value; } +bool CombinerHelper::isConstantOrConstantVectorI(Register Src) { + auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI); + if (IConstant) + return true; + GBuildVector *BuildVector = getOpcodeDef(Src, MRI); + if (!BuildVector) + return false; + unsigned NumSources = BuildVector->getNumSources(); + for (unsigned I = 0; I < NumSources; ++I) { + std::optional IConstant = + getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI); + if (!IConstant) + return false; + } + return true; // FIXME: G_SPLAT_VECTOR +} + // TODO: use knownbits to determine zeros bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select, BuildFnTy &MatchInfo) { @@ -6918,3 +6935,195 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) { return false; } + +bool CombinerHelper::isZExtOrTruncLegal(LLT ToTy, LLT FromTy) const { + // Copy. + if (ToTy == FromTy) + return true; + + if (isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {ToTy, FromTy}})) + return true; + + if (isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {ToTy, FromTy}})) + return true; + + return false; +} + +bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, + BuildFnTy &MatchInfo) { + GAddCarryInOut *Add = cast(&MI); + + // adde has no flags. 
+ Register Dst = Add->getDstReg(); + Register Carry = Add->getCarryOutReg(); + Register CarryIn = Add->getCarryInReg(); + Register LHS = Add->getLHSReg(); + Register RHS = Add->getRHSReg(); + bool IsSigned = Add->isSigned(); + LLT DstTy = MRI.getType(Dst); + LLT CarryTy = MRI.getType(Carry); + LLT OperandTy = MRI.getType(LHS); + LLT CarryInTy = MRI.getType(CarryIn); + + // FIXME: handle undef + + // fold sadde, if the carry is dead -> add(add(LHS, RHS), + // zextOrTrunc(CarryIn)), undef. + if (MRI.use_nodbg_empty(Carry) && IsSigned && MRI.hasOneNonDBGUse(Dst) && + isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) && + isZExtOrTruncLegal(DstTy, CarryInTy)) { + MatchInfo = [=](MachineIRBuilder &B) { + auto A = B.buildAdd(DstTy, LHS, RHS); + Register AReg = A.getReg(0); + auto ZextCarryIn = B.buildZExtOrTrunc(DstTy, CarryIn); + Register ZextCarryInReg = ZextCarryIn.getReg(0); + B.buildAdd(Dst, AReg, ZextCarryInReg); + B.buildUndef(Carry); + }; + return true; + } + + // We want do fold the [u|s]adde. + if (!MRI.hasOneNonDBGUse(Dst) || !MRI.hasOneNonDBGUse(Carry)) + return false; + + // The parameters of the adde must be integer-like. + std::optional MaybeLHS = getConstantOrConstantSplatVector(LHS); + std::optional MaybeRHS = getConstantOrConstantSplatVector(RHS); + std::optional MaybeCarryIn = getConstantOrConstantSplatVector(CarryIn); + + // fold adde(c, c, c) -> c, carry + if (MaybeLHS && MaybeRHS && MaybeCarryIn && + isConstantLegalOrBeforeLegalizer(DstTy) && + isConstantLegalOrBeforeLegalizer(CarryTy)) { + // They must all have the same bitwidth. Otherwise APInt might + // assert. Prelegalization, they may have widely different bitwidths. 
+ unsigned BitWidth = + std::max(std::max(MaybeLHS->getBitWidth(), MaybeRHS->getBitWidth()), + MaybeCarryIn->getBitWidth()); + if (IsSigned) { + APInt LHS = MaybeLHS->sext(BitWidth); + APInt RHS = MaybeRHS->sext(BitWidth); + APInt CarryIn = MaybeCarryIn->zext(BitWidth); + bool FirstOverflowed = false; + bool SecondOverflowed = false; + APInt Result = + LHS.sadd_ov(RHS, FirstOverflowed).sadd_ov(CarryIn, SecondOverflowed); + MatchInfo = [=](MachineIRBuilder &B) { + B.buildConstant(Dst, Result); + B.buildConstant(Carry, FirstOverflowed | SecondOverflowed); + }; + return true; + } else if (!IsSigned) { + APInt LHS = MaybeLHS->zext(BitWidth); + APInt RHS = MaybeRHS->zext(BitWidth); + APInt CarryIn = MaybeCarryIn->zext(BitWidth); + bool FirstOverflowed = false; + bool SecondOverflowed = false; + APInt Result = + LHS.uadd_ov(RHS, FirstOverflowed).uadd_ov(CarryIn, SecondOverflowed); + MatchInfo = [=](MachineIRBuilder &B) { + B.buildConstant(Dst, Result); + B.buildConstant(Carry, FirstOverflowed | SecondOverflowed); + }; + return true; + } + } + + // canonicalize constant to RHS. 
+ if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) { + if (IsSigned) { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildSAdde(Dst, Carry, RHS, LHS, CarryIn); + }; + return true; + } else { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildUAdde(Dst, Carry, RHS, LHS, CarryIn); + }; + return true; + } + } + + // fold adde(LHS, RHS, 0) -> addo(LHS, RHS) + if (MaybeCarryIn && *MaybeCarryIn == 0) { + if (IsSigned && isLegalOrBeforeLegalizer( + {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}})) { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildSAddo(Dst, Carry, LHS, RHS); + }; + return true; + } else if (!IsSigned && + isLegalOrBeforeLegalizer( + {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}})) { + MatchInfo = [=](MachineIRBuilder &B) { + B.buildUAddo(Dst, Carry, LHS, RHS); + }; + return true; + } + } + + // fold adde(LHS, 0, Carry) -> addo(LHS, Carry) + if (MaybeRHS && *MaybeRHS == 0) { + if (IsSigned && + isLegalOrBeforeLegalizer( + {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}}) && + isZExtOrTruncLegal(OperandTy, CarryInTy)) { + MatchInfo = [=](MachineIRBuilder &B) { + auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + Register ZextCarryInReg = ZextCarryIn.getReg(0); + B.buildSAddo(Dst, Carry, LHS, ZextCarryInReg); + }; + return true; + } else if (!IsSigned && + isLegalOrBeforeLegalizer( + {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}}) && + isZExtOrTruncLegal(OperandTy, CarryInTy)) { + MatchInfo = [=](MachineIRBuilder &B) { + auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + Register ZextCarryInReg = ZextCarryIn.getReg(0); + B.buildUAddo(Dst, Carry, LHS, ZextCarryInReg); + }; + return true; + } + } + + // We lower to 2*addo + 1*or. 
+ if (IsSigned && + isLegalOrBeforeLegalizer( + {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}}) && + isLegalOrBeforeLegalizer({TargetOpcode::G_OR, {DstTy}}) && + isZExtOrTruncLegal(OperandTy, CarryInTy)) { + MatchInfo = [=](MachineIRBuilder &B) { + auto First = B.buildSAddo(DstTy, CarryTy, LHS, RHS); + Register FirstResult = First.getReg(0); + Register FirstCarry = First.getReg(1); + auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + auto Second = B.buildSAddo(DstTy, CarryTy, FirstResult, ZextCarryIn); + Register Result = Second.getReg(0); + Register SecondCarry = Second.getReg(1); + B.buildCopy(Dst, Result); + B.buildOr(Carry, FirstCarry, SecondCarry); + }; + return true; + } else if (!IsSigned && + isLegalOrBeforeLegalizer( + {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}}) && + isLegalOrBeforeLegalizer({TargetOpcode::G_OR, {DstTy}}) && + isZExtOrTruncLegal(OperandTy, CarryInTy)) { + MatchInfo = [=](MachineIRBuilder &B) { + auto First = B.buildUAddo(DstTy, CarryTy, LHS, RHS); + Register FirstResult = First.getReg(0); + Register FirstCarry = First.getReg(1); + auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + auto Second = B.buildUAddo(DstTy, CarryTy, FirstResult, ZextCarryIn); + Register Result = Second.getReg(0); + Register SecondCarry = Second.getReg(1); + B.buildCopy(Dst, Result); + B.buildOr(Carry, FirstCarry, SecondCarry); + }; + return true; + } + + return false; +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir new file mode 100644 index 0000000000000..61c7f56f4b260 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir @@ -0,0 +1,300 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +--- +# add, _ = sadde(_, _, In) +name: carryout_unused 
+body: | + bb.0.entry: + ; CHECK-LABEL: name: carryout_unused + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: %add:_(s64) = G_ADD [[ADD]], [[ZEXT]] + ; CHECK-NEXT: $x0 = COPY %add(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %3 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + $x0 = COPY %add +... +--- +# add, _ = uadde(_, _, In) +name: carryout_unused_unsigned +body: | + bb.0.entry: + ; CHECK-LABEL: name: carryout_unused_unsigned + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_UADDE [[COPY]], [[COPY]], %carry_in + ; CHECK-NEXT: $x0 = COPY %add(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %3 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + $x0 = COPY %add +... 
+--- +# add, multi_c = sadde(L, R, In) +name: multi_use_unsigned +body: | + bb.0.entry: + ; CHECK-LABEL: name: multi_use_unsigned + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_UADDE [[COPY]], [[COPY]], %carry_in + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: %carry_out_ext2:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + ; CHECK-NEXT: $x2 = COPY %carry_out_ext2(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %3 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + %carry_out_ext2:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext + $x2 = COPY %carry_out_ext2 +... +--- +# add, c = sadde(L, R, In) +name: constant_fold_signed +body: | + bb.0.entry: + ; CHECK-LABEL: name: constant_fold_signed + ; CHECK: %add:_(s64) = G_CONSTANT i64 29 + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = G_CONSTANT i64 1 + %lhs:_(s64) = G_CONSTANT i64 11 + %rhs:_(s64) = G_CONSTANT i64 17 + %carry_in:_(s1) = G_CONSTANT i1 1 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = uadde(L, R, In) +name: constant_fold_unsigned +body: | + bb.0.entry: + ; CHECK-LABEL: name: constant_fold_unsigned + ; CHECK: %add:_(s64) = G_CONSTANT i64 27 + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = G_CONSTANT i64 1 + %lhs:_(s64) = G_CONSTANT i64 19 + %rhs:_(s64) = G_CONSTANT i64 7 + %carry_in:_(s1) = G_CONSTANT i1 1 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... +--- +# add, c = uadde(L, R, In) +name: canonicalize_to_rhs_plus_lower +body: | + bb.0.entry: + ; CHECK-LABEL: name: canonicalize_to_rhs_plus_lower + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %lhs:_(s64) = G_CONSTANT i64 19 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[COPY]], %lhs + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(s64), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[ZEXT]] + ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[UADDO1]], [[UADDO3]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY [[UADDO2]](s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = G_CONSTANT i64 19 + %rhs:_(s64) = COPY %3 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = sadde(L, R, 0) +name: fold_to_addo_l_r +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_addo_l_r + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_SADDO [[COPY]], [[COPY1]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %4 + %carry_in:_(s1) = G_CONSTANT i1 0 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... +--- +# add, c = sadde(L, 0, CarryIn) +name: fold_to_addo_l_carryin +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_addo_l_carryin + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_SADDO [[COPY]], [[ZEXT]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY %add(s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = G_CONSTANT i64 0 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = sadde(L, R, CarryIn) +name: fold_to_lower_signed +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_lower_signed + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[SADDO:%[0-9]+]]:_(s64), [[SADDO1:%[0-9]+]]:_(s1) = G_SADDO [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: [[SADDO2:%[0-9]+]]:_(s64), [[SADDO3:%[0-9]+]]:_(s1) = G_SADDO [[SADDO]], [[ZEXT]] + ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[SADDO1]], [[SADDO3]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY [[SADDO2]](s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %4 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = uadde(L, R, CarryIn) +name: fold_to_lower_unsigned +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_lower_unsigned + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1) + ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(s64), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[ZEXT]] + ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[UADDO1]], [[UADDO3]] + ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1) + ; CHECK-NEXT: $x0 = COPY [[UADDO2]](s64) + ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %lhs:_(s64) = COPY %3 + %rhs:_(s64) = COPY %4 + %carry_in:_(s1) = G_TRUNC %4 + %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in + %carry_out_ext:_(s64) = G_ANYEXT %carry_out + $x0 = COPY %add + $x1 = COPY %carry_out_ext +... 
+--- +# add, c = uadde(L, R, CarryIn) +name: fold_to_lower_vectorized +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_to_lower_vectorized + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK-NEXT: %onebit:_(s1) = G_TRUNC [[COPY4]](s64) + ; CHECK-NEXT: %lhs:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64) + ; CHECK-NEXT: %rhs:_(<2 x s64>) = G_BUILD_VECTOR [[COPY2]](s64), [[COPY3]](s64) + ; CHECK-NEXT: %carry_in:_(<2 x s1>) = G_BUILD_VECTOR %onebit(s1), %onebit(s1) + ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(<2 x s64>), [[UADDO1:%[0-9]+]]:_(<2 x s1>) = G_UADDO %lhs, %rhs + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<2 x s64>) = G_ZEXT %carry_in(<2 x s1>) + ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(<2 x s64>), [[UADDO3:%[0-9]+]]:_(<2 x s1>) = G_UADDO [[UADDO]], [[ZEXT]] + ; CHECK-NEXT: %carry_out:_(<2 x s1>) = G_OR [[UADDO1]], [[UADDO3]] + ; CHECK-NEXT: %zext:_(<2 x s64>) = G_ZEXT %carry_out(<2 x s1>) + ; CHECK-NEXT: $q0 = COPY %zext(<2 x s64>) + ; CHECK-NEXT: $q0 = COPY [[UADDO2]](<2 x s64>) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %3:_(s64) = COPY $x3 + %4:_(s64) = COPY $x4 + %onebit:_(s1) = G_TRUNC %4 + %lhs:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64) + %rhs:_(<2 x s64>) = G_BUILD_VECTOR %2(s64), %3(s64) + %carry_in:_(<2 x s1>) = G_BUILD_VECTOR %onebit(s1), %onebit(s1) + %add:_(<2 x s64>), %carry_out:_(<2 x s1>) = G_UADDE %lhs, %rhs, %carry_in + %zext:_(<2 x s64>) = G_ZEXT %carry_out(<2 x s1>) + $q0 = COPY %zext + $q0 = COPY %add +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll index ff5880819020d..f337e6cf55292 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -39,9 +39,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) { ; GFX7-LABEL: v_uaddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], v0, v2 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX7-NEXT: v_add_i32_e64 v1, s[4:5], v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -49,9 +52,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) { ; GFX8-LABEL: v_uaddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -59,9 +65,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) { ; GFX9-LABEL: v_uaddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, 
v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -477,8 +486,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_uaddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_cselect_b32 s2, 1, 0 +; GFX7-NEXT: s_add_u32 s1, s1, s3 +; GFX7-NEXT: s_cselect_b32 s3, 1, 0 +; GFX7-NEXT: s_add_u32 s1, s1, s2 +; GFX7-NEXT: s_cselect_b32 s2, 1, 0 +; GFX7-NEXT: s_or_b32 s2, s3, s2 +; GFX7-NEXT: s_and_b32 s2, s2, 1 ; GFX7-NEXT: s_add_u32 s0, s0, s2 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: ; return to shader part epilog @@ -486,8 +500,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX8-LABEL: s_uaddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s2 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_add_u32 s0, s0, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: ; return to shader part epilog @@ -495,8 +514,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s2 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_addc_u32 s1, 
s1, 0 ; GFX9-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 4c1935d06517e..eff845a146ace 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1084,7 +1084,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: v_mov_b32_e32 v1, s12 ; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX7-NEXT: s_mul_i32 s18, s1, s8 -; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 ; GFX7-NEXT: s_add_u32 s18, s18, s17 ; GFX7-NEXT: s_addc_u32 s17, s23, s22 ; GFX7-NEXT: v_mov_b32_e32 v4, s11 @@ -1095,33 +1095,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_mul_i32 s24, s1, s11 ; GFX7-NEXT: v_readfirstlane_b32 s28, v3 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_readfirstlane_b32 s27, v5 +; GFX7-NEXT: v_readfirstlane_b32 s25, v5 ; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9 -; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_cselect_b32 s27, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s27, s23 +; GFX7-NEXT: s_addc_u32 s23, s25, s23 ; GFX7-NEXT: v_readfirstlane_b32 s29, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8 -; GFX7-NEXT: s_mul_i32 s27, s2, s10 +; GFX7-NEXT: s_mul_i32 s25, s2, s10 ; GFX7-NEXT: s_cselect_b32 s22, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s27, s24 +; GFX7-NEXT: s_add_u32 s24, s25, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX7-NEXT: s_addc_u32 s27, s28, s23 +; GFX7-NEXT: s_addc_u32 s25, s28, s23 ; GFX7-NEXT: s_mul_i32 s28, s3, s9 ; GFX7-NEXT: s_cselect_b32 s23, 1, 0 ; GFX7-NEXT: s_add_u32 s28, s28, s24 ; GFX7-NEXT: v_readfirstlane_b32 s30, v6 ; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX7-NEXT: s_addc_u32 s27, s29, s27 +; GFX7-NEXT: s_addc_u32 s25, s29, s25 ; GFX7-NEXT: s_mul_i32 s29, s4, s8 ; GFX7-NEXT: s_cselect_b32 s24, 1, 0 ; GFX7-NEXT: s_add_u32 s28, s29, s28 ; 
GFX7-NEXT: v_readfirstlane_b32 s33, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX7-NEXT: s_addc_u32 s27, s30, s27 +; GFX7-NEXT: s_addc_u32 s29, s30, s25 ; GFX7-NEXT: s_mul_i32 s30, s16, s11 -; GFX7-NEXT: s_cselect_b32 s29, 1, 0 +; GFX7-NEXT: s_cselect_b32 s25, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s31, v6 ; GFX7-NEXT: s_add_u32 s19, s30, s19 ; GFX7-NEXT: s_addc_u32 s28, s31, s28 @@ -1139,88 +1139,93 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cselect_b32 s33, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: s_add_u32 s19, s34, s19 -; GFX7-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-NEXT: s_addc_u32 s28, s35, s28 -; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_cselect_b32 s34, 1, 0 -; GFX7-NEXT: s_cmp_lg_u32 s26, 0 -; GFX7-NEXT: s_addc_u32 s19, s25, s19 -; GFX7-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_add_u32 s19, s26, s19 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_add_u32 s19, s19, s27 +; GFX7-NEXT: s_cselect_b32 s27, 1, 0 +; GFX7-NEXT: s_or_b32 s26, s26, s27 +; GFX7-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 -; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 +; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_addc_u32 s20, s20, 0 -; GFX7-NEXT: v_readfirstlane_b32 s26, v0 +; GFX7-NEXT: s_add_u32 s20, s20, s28 +; GFX7-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-NEXT: s_cselect_b32 s21, 1, 0 +; GFX7-NEXT: s_and_b32 s26, s26, 1 +; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 +; GFX7-NEXT: s_add_u32 s20, s20, s26 +; GFX7-NEXT: v_readfirstlane_b32 s27, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX7-NEXT: s_cmp_lg_u32 s25, 0 -; GFX7-NEXT: s_addc_u32 s20, s20, s28 -; GFX7-NEXT: s_mul_i32 s25, s16, s14 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_or_b32 s21, s21, s26 +; GFX7-NEXT: s_mul_i32 s26, s16, s14 ; GFX7-NEXT: s_mul_i32 s28, s1, s13 -; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; 
GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s26, s28, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11 ; GFX7-NEXT: s_mul_i32 s28, s2, s12 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s26, s28, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10 ; GFX7-NEXT: s_mul_i32 s28, s3, s11 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s26, s28, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9 ; GFX7-NEXT: s_mul_i32 s28, s4, s10 -; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_add_u32 s26, s28, s26 ; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8 ; GFX7-NEXT: s_mul_i32 s28, s5, s9 -; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_add_u32 s26, s28, s26 ; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX7-NEXT: v_readfirstlane_b32 s36, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: s_mul_i32 s28, s6, s8 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s25, s28, s25 -; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_add_u32 s26, s28, s26 +; GFX7-NEXT: s_addc_u32 s27, s35, s27 ; GFX7-NEXT: s_mul_i32 s28, s16, s13 ; GFX7-NEXT: v_readfirstlane_b32 s35, v2 -; GFX7-NEXT: s_add_u32 s27, s28, s27 +; GFX7-NEXT: s_add_u32 s28, s28, s29 ; GFX7-NEXT: v_readfirstlane_b32 s37, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX7-NEXT: s_addc_u32 s25, s35, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: s_mul_i32 s35, s1, s12 -; GFX7-NEXT: 
s_cselect_b32 s28, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s35, s27 -; GFX7-NEXT: s_addc_u32 s25, s36, s25 +; GFX7-NEXT: s_cselect_b32 s29, 1, 0 +; GFX7-NEXT: s_add_u32 s28, s35, s28 +; GFX7-NEXT: s_addc_u32 s26, s36, s26 ; GFX7-NEXT: s_mul_i32 s36, s2, s11 ; GFX7-NEXT: s_cselect_b32 s35, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s36, s27 +; GFX7-NEXT: s_add_u32 s28, s36, s28 ; GFX7-NEXT: v_readfirstlane_b32 s38, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX7-NEXT: s_addc_u32 s25, s37, s25 +; GFX7-NEXT: s_addc_u32 s26, s37, s26 ; GFX7-NEXT: s_mul_i32 s37, s3, s10 ; GFX7-NEXT: s_cselect_b32 s36, 1, 0 -; GFX7-NEXT: s_add_u32 s27, s37, s27 +; GFX7-NEXT: s_add_u32 s28, s37, s28 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX7-NEXT: s_addc_u32 s25, s38, s25 +; GFX7-NEXT: s_addc_u32 s26, s38, s26 ; GFX7-NEXT: s_mul_i32 s38, s4, s9 ; GFX7-NEXT: s_cselect_b32 s37, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s39, v1 -; GFX7-NEXT: s_add_u32 s27, s38, s27 -; GFX7-NEXT: s_addc_u32 s25, s39, s25 +; GFX7-NEXT: s_add_u32 s28, s38, s28 +; GFX7-NEXT: s_addc_u32 s26, s39, s26 ; GFX7-NEXT: s_mul_i32 s39, s5, s8 ; GFX7-NEXT: s_cselect_b32 s38, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s40, v0 -; GFX7-NEXT: s_add_u32 s27, s39, s27 -; GFX7-NEXT: s_addc_u32 s25, s40, s25 +; GFX7-NEXT: s_add_u32 s28, s39, s28 +; GFX7-NEXT: s_addc_u32 s26, s40, s26 ; GFX7-NEXT: s_cselect_b32 s39, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s31, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 @@ -1228,19 +1233,28 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s34, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 -; GFX7-NEXT: s_cmp_lg_u32 s21, 0 -; GFX7-NEXT: s_addc_u32 s21, s30, s27 -; GFX7-NEXT: s_cselect_b32 s27, 1, 0 +; GFX7-NEXT: s_add_u32 s28, s30, s28 +; GFX7-NEXT: s_cselect_b32 s30, 1, 0 +; GFX7-NEXT: s_and_b32 s21, s21, 1 +; GFX7-NEXT: s_add_u32 s21, s28, s21 +; GFX7-NEXT: s_cselect_b32 s28, 1, 0 +; GFX7-NEXT: s_or_b32 s28, s30, s28 ; GFX7-NEXT: s_cmp_lg_u32 s23, 0 ; 
GFX7-NEXT: s_addc_u32 s22, s22, 0 ; GFX7-NEXT: s_cmp_lg_u32 s24, 0 ; GFX7-NEXT: s_addc_u32 s22, s22, 0 -; GFX7-NEXT: s_cmp_lg_u32 s29, 0 +; GFX7-NEXT: s_cmp_lg_u32 s25, 0 ; GFX7-NEXT: s_addc_u32 s22, s22, 0 -; GFX7-NEXT: s_cmp_lg_u32 s27, 0 -; GFX7-NEXT: s_addc_u32 s22, s22, s25 +; GFX7-NEXT: s_add_u32 s22, s22, s26 +; GFX7-NEXT: s_cselect_b32 s23, 1, 0 +; GFX7-NEXT: s_and_b32 s24, s28, 1 +; GFX7-NEXT: s_add_u32 s22, s22, s24 +; GFX7-NEXT: s_cselect_b32 s24, 1, 0 +; GFX7-NEXT: s_or_b32 s23, s23, s24 ; GFX7-NEXT: s_mul_i32 s16, s16, s15 -; GFX7-NEXT: s_addc_u32 s15, s26, s16 +; GFX7-NEXT: s_and_b32 s15, s23, 1 +; GFX7-NEXT: s_cmp_lg_u32 s15, 0 +; GFX7-NEXT: s_addc_u32 s15, s27, s16 ; GFX7-NEXT: s_mul_i32 s1, s1, s14 ; GFX7-NEXT: s_cmp_lg_u32 s39, 0 ; GFX7-NEXT: s_addc_u32 s1, s15, s1 @@ -1257,7 +1271,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s35, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_mul_i32 s6, s6, s9 -; GFX7-NEXT: s_cmp_lg_u32 s28, 0 +; GFX7-NEXT: s_cmp_lg_u32 s29, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s6 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s0, s0, s8 @@ -1305,7 +1319,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: v_mov_b32_e32 v1, s12 ; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX8-NEXT: s_mul_i32 s18, s1, s8 -; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 ; GFX8-NEXT: s_add_u32 s18, s18, s17 ; GFX8-NEXT: s_addc_u32 s17, s23, s22 ; GFX8-NEXT: v_mov_b32_e32 v4, s11 @@ -1316,33 +1330,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_mul_i32 s24, s1, s11 ; GFX8-NEXT: v_readfirstlane_b32 s28, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_readfirstlane_b32 s27, v5 +; GFX8-NEXT: v_readfirstlane_b32 s25, v5 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 -; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s24, s22 
-; GFX8-NEXT: s_addc_u32 s23, s27, s23 +; GFX8-NEXT: s_addc_u32 s23, s25, s23 ; GFX8-NEXT: v_readfirstlane_b32 s29, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8 -; GFX8-NEXT: s_mul_i32 s27, s2, s10 +; GFX8-NEXT: s_mul_i32 s25, s2, s10 ; GFX8-NEXT: s_cselect_b32 s22, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s27, s24 +; GFX8-NEXT: s_add_u32 s24, s25, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX8-NEXT: s_addc_u32 s27, s28, s23 +; GFX8-NEXT: s_addc_u32 s25, s28, s23 ; GFX8-NEXT: s_mul_i32 s28, s3, s9 ; GFX8-NEXT: s_cselect_b32 s23, 1, 0 ; GFX8-NEXT: s_add_u32 s28, s28, s24 ; GFX8-NEXT: v_readfirstlane_b32 s30, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX8-NEXT: s_addc_u32 s27, s29, s27 +; GFX8-NEXT: s_addc_u32 s25, s29, s25 ; GFX8-NEXT: s_mul_i32 s29, s4, s8 ; GFX8-NEXT: s_cselect_b32 s24, 1, 0 ; GFX8-NEXT: s_add_u32 s28, s29, s28 ; GFX8-NEXT: v_readfirstlane_b32 s33, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX8-NEXT: s_addc_u32 s27, s30, s27 +; GFX8-NEXT: s_addc_u32 s29, s30, s25 ; GFX8-NEXT: s_mul_i32 s30, s16, s11 -; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s31, v6 ; GFX8-NEXT: s_add_u32 s19, s30, s19 ; GFX8-NEXT: s_addc_u32 s28, s31, s28 @@ -1360,88 +1374,93 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cselect_b32 s33, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: s_add_u32 s19, s34, s19 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: s_addc_u32 s28, s35, s28 -; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_cselect_b32 s34, 1, 0 -; GFX8-NEXT: s_cmp_lg_u32 s26, 0 -; GFX8-NEXT: s_addc_u32 s19, s25, s19 -; GFX8-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_add_u32 s19, s26, s19 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_add_u32 s19, s19, s27 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_or_b32 s26, s26, s27 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; 
GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 +; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_addc_u32 s20, s20, 0 -; GFX8-NEXT: v_readfirstlane_b32 s26, v0 +; GFX8-NEXT: s_add_u32 s20, s20, s28 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_and_b32 s26, s26, 1 +; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 +; GFX8-NEXT: s_add_u32 s20, s20, s26 +; GFX8-NEXT: v_readfirstlane_b32 s27, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX8-NEXT: s_cmp_lg_u32 s25, 0 -; GFX8-NEXT: s_addc_u32 s20, s20, s28 -; GFX8-NEXT: s_mul_i32 s25, s16, s14 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_or_b32 s21, s21, s26 +; GFX8-NEXT: s_mul_i32 s26, s16, s14 ; GFX8-NEXT: s_mul_i32 s28, s1, s13 -; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s26, s28, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11 ; GFX8-NEXT: s_mul_i32 s28, s2, s12 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s26, s28, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10 ; GFX8-NEXT: s_mul_i32 s28, s3, s11 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s26, s28, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9 ; GFX8-NEXT: s_mul_i32 s28, s4, s10 -; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_add_u32 s26, s28, s26 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8 ; GFX8-NEXT: s_mul_i32 s28, 
s5, s9 -; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_add_u32 s26, s28, s26 ; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX8-NEXT: v_readfirstlane_b32 s36, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: s_mul_i32 s28, s6, s8 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s25, s28, s25 -; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_add_u32 s26, s28, s26 +; GFX8-NEXT: s_addc_u32 s27, s35, s27 ; GFX8-NEXT: s_mul_i32 s28, s16, s13 ; GFX8-NEXT: v_readfirstlane_b32 s35, v2 -; GFX8-NEXT: s_add_u32 s27, s28, s27 +; GFX8-NEXT: s_add_u32 s28, s28, s29 ; GFX8-NEXT: v_readfirstlane_b32 s37, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX8-NEXT: s_addc_u32 s25, s35, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: s_mul_i32 s35, s1, s12 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s35, s27 -; GFX8-NEXT: s_addc_u32 s25, s36, s25 +; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_add_u32 s28, s35, s28 +; GFX8-NEXT: s_addc_u32 s26, s36, s26 ; GFX8-NEXT: s_mul_i32 s36, s2, s11 ; GFX8-NEXT: s_cselect_b32 s35, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s36, s27 +; GFX8-NEXT: s_add_u32 s28, s36, s28 ; GFX8-NEXT: v_readfirstlane_b32 s38, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX8-NEXT: s_addc_u32 s25, s37, s25 +; GFX8-NEXT: s_addc_u32 s26, s37, s26 ; GFX8-NEXT: s_mul_i32 s37, s3, s10 ; GFX8-NEXT: s_cselect_b32 s36, 1, 0 -; GFX8-NEXT: s_add_u32 s27, s37, s27 +; GFX8-NEXT: s_add_u32 s28, s37, s28 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX8-NEXT: s_addc_u32 s25, s38, s25 +; GFX8-NEXT: s_addc_u32 s26, s38, s26 ; GFX8-NEXT: s_mul_i32 s38, s4, s9 ; GFX8-NEXT: s_cselect_b32 s37, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s39, v1 -; GFX8-NEXT: s_add_u32 s27, s38, s27 -; GFX8-NEXT: s_addc_u32 s25, s39, s25 +; GFX8-NEXT: s_add_u32 s28, s38, s28 +; GFX8-NEXT: s_addc_u32 s26, s39, s26 ; GFX8-NEXT: s_mul_i32 s39, s5, s8 ; GFX8-NEXT: s_cselect_b32 s38, 1, 0 ; 
GFX8-NEXT: v_readfirstlane_b32 s40, v0 -; GFX8-NEXT: s_add_u32 s27, s39, s27 -; GFX8-NEXT: s_addc_u32 s25, s40, s25 +; GFX8-NEXT: s_add_u32 s28, s39, s28 +; GFX8-NEXT: s_addc_u32 s26, s40, s26 ; GFX8-NEXT: s_cselect_b32 s39, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s31, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 @@ -1449,19 +1468,28 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s34, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 -; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_addc_u32 s21, s30, s27 -; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_add_u32 s28, s30, s28 +; GFX8-NEXT: s_cselect_b32 s30, 1, 0 +; GFX8-NEXT: s_and_b32 s21, s21, 1 +; GFX8-NEXT: s_add_u32 s21, s28, s21 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_or_b32 s28, s30, s28 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0 ; GFX8-NEXT: s_addc_u32 s22, s22, 0 ; GFX8-NEXT: s_cmp_lg_u32 s24, 0 ; GFX8-NEXT: s_addc_u32 s22, s22, 0 -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cmp_lg_u32 s25, 0 ; GFX8-NEXT: s_addc_u32 s22, s22, 0 -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: s_addc_u32 s22, s22, s25 +; GFX8-NEXT: s_add_u32 s22, s22, s26 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 +; GFX8-NEXT: s_and_b32 s24, s28, 1 +; GFX8-NEXT: s_add_u32 s22, s22, s24 +; GFX8-NEXT: s_cselect_b32 s24, 1, 0 +; GFX8-NEXT: s_or_b32 s23, s23, s24 ; GFX8-NEXT: s_mul_i32 s16, s16, s15 -; GFX8-NEXT: s_addc_u32 s15, s26, s16 +; GFX8-NEXT: s_and_b32 s15, s23, 1 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_addc_u32 s15, s27, s16 ; GFX8-NEXT: s_mul_i32 s1, s1, s14 ; GFX8-NEXT: s_cmp_lg_u32 s39, 0 ; GFX8-NEXT: s_addc_u32 s1, s15, s1 @@ -1478,7 +1506,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s35, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_mul_i32 s6, s6, s9 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s6 ; GFX8-NEXT: s_mul_i32 s7, 
s7, s8 ; GFX8-NEXT: s_mul_i32 s0, s0, s8 @@ -1510,15 +1538,15 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9 ; GFX9-NEXT: s_add_u32 s17, s22, s17 -; GFX9-NEXT: s_addc_u32 s18, s23, s18 -; GFX9-NEXT: s_mul_i32 s23, s1, s8 -; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_addc_u32 s22, s23, s18 +; GFX9-NEXT: s_mul_i32 s18, s1, s8 +; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8 -; GFX9-NEXT: s_add_u32 s17, s23, s17 -; GFX9-NEXT: s_addc_u32 s18, s24, s18 +; GFX9-NEXT: s_add_u32 s18, s18, s17 +; GFX9-NEXT: s_addc_u32 s17, s24, s22 ; GFX9-NEXT: s_mul_i32 s24, s16, s12 ; GFX9-NEXT: s_mul_i32 s26, s1, s11 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12 ; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11 ; GFX9-NEXT: s_add_u32 s24, s26, s24 @@ -1559,16 +1587,21 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_add_u32 s19, s34, s19 ; GFX9-NEXT: s_addc_u32 s24, s35, s24 ; GFX9-NEXT: s_cselect_b32 s34, 1, 0 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_addc_u32 s19, s22, s19 +; GFX9-NEXT: s_add_u32 s19, s23, s19 +; GFX9-NEXT: s_cselect_b32 s23, 1, 0 +; GFX9-NEXT: s_add_u32 s19, s19, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_or_b32 s22, s23, s22 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 ; GFX9-NEXT: s_addc_u32 s20, s20, 0 -; GFX9-NEXT: s_cmp_lg_u32 s22, 0 -; GFX9-NEXT: s_addc_u32 s20, s20, s24 +; GFX9-NEXT: s_add_u32 s20, s20, s24 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 +; GFX9-NEXT: s_and_b32 s22, s22, 1 +; GFX9-NEXT: s_add_u32 s20, s20, s22 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_or_b32 s21, s21, s22 ; GFX9-NEXT: s_mul_i32 s22, s16, s14 ; GFX9-NEXT: s_mul_i32 s24, s1, s13 -; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s23, s16, s14 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13 ; GFX9-NEXT: s_add_u32 s22, s24, 
s22 @@ -1629,18 +1662,27 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_addc_u32 s30, s30, 0 ; GFX9-NEXT: s_cmp_lg_u32 s34, 0 ; GFX9-NEXT: s_addc_u32 s30, s30, 0 -; GFX9-NEXT: s_cmp_lg_u32 s21, 0 -; GFX9-NEXT: s_addc_u32 s21, s30, s24 +; GFX9-NEXT: s_add_u32 s24, s30, s24 +; GFX9-NEXT: s_cselect_b32 s30, 1, 0 +; GFX9-NEXT: s_and_b32 s21, s21, 1 +; GFX9-NEXT: s_add_u32 s21, s24, s21 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 +; GFX9-NEXT: s_or_b32 s24, s30, s24 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s29, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 -; GFX9-NEXT: s_cmp_lg_u32 s24, 0 -; GFX9-NEXT: s_addc_u32 s22, s26, s22 +; GFX9-NEXT: s_add_u32 s22, s26, s22 +; GFX9-NEXT: s_cselect_b32 s26, 1, 0 +; GFX9-NEXT: s_and_b32 s24, s24, 1 +; GFX9-NEXT: s_add_u32 s22, s22, s24 +; GFX9-NEXT: s_cselect_b32 s24, 1, 0 +; GFX9-NEXT: s_or_b32 s24, s26, s24 ; GFX9-NEXT: s_mul_i32 s16, s16, s15 +; GFX9-NEXT: s_and_b32 s15, s24, 1 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_addc_u32 s15, s23, s16 ; GFX9-NEXT: s_mul_i32 s1, s1, s14 ; GFX9-NEXT: s_cmp_lg_u32 s39, 0 @@ -1663,192 +1705,399 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_i32 s0, s0, s8 ; GFX9-NEXT: s_add_u32 s7, s7, s1 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s18 +; GFX9-NEXT: s_mov_b32 s2, s17 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s4, s20 ; GFX9-NEXT: s_mov_b32 s5, s21 ; GFX9-NEXT: s_mov_b32 s6, s22 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_mul_i256: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_mul_i32 s17, s0, s10 -; GFX10PLUS-NEXT: s_mul_i32 s19, s1, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s18, s0, s10 -; GFX10PLUS-NEXT: s_mul_hi_u32 s20, s1, s9 -; GFX10PLUS-NEXT: s_add_u32 s17, s19, s17 -; 
GFX10PLUS-NEXT: s_addc_u32 s18, s20, s18 -; GFX10PLUS-NEXT: s_mul_i32 s20, s2, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s21, s2, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s17, s20, s17 -; GFX10PLUS-NEXT: s_mul_hi_u32 s16, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 -; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s9 -; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s16, s21, s16 -; GFX10PLUS-NEXT: s_addc_u32 s17, s22, s17 -; GFX10PLUS-NEXT: s_mul_i32 s22, s1, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s23, s1, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s16, s22, s16 -; GFX10PLUS-NEXT: s_addc_u32 s17, s23, s17 -; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s12 -; GFX10PLUS-NEXT: s_mul_i32 s25, s1, s11 -; GFX10PLUS-NEXT: s_mul_hi_u32 s24, s0, s12 -; GFX10PLUS-NEXT: s_mul_hi_u32 s26, s1, s11 -; GFX10PLUS-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s25, s23 -; GFX10PLUS-NEXT: s_addc_u32 s24, s26, s24 -; GFX10PLUS-NEXT: s_mul_i32 s26, s2, s10 -; GFX10PLUS-NEXT: s_mul_hi_u32 s27, s2, s10 -; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s26, s23 -; GFX10PLUS-NEXT: s_addc_u32 s24, s27, s24 -; GFX10PLUS-NEXT: s_mul_i32 s27, s3, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s28, s3, s9 -; GFX10PLUS-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s27, s23 -; GFX10PLUS-NEXT: s_addc_u32 s24, s28, s24 -; GFX10PLUS-NEXT: s_mul_i32 s28, s4, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s29, s4, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s28, s23 -; GFX10PLUS-NEXT: s_addc_u32 s24, s29, s24 -; GFX10PLUS-NEXT: s_mul_i32 s29, s0, s11 -; GFX10PLUS-NEXT: s_mul_hi_u32 s30, s0, s11 -; GFX10PLUS-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s18, s29, s18 -; GFX10PLUS-NEXT: s_addc_u32 s23, s30, s23 -; GFX10PLUS-NEXT: s_mul_i32 s30, s1, s10 -; GFX10PLUS-NEXT: s_mul_hi_u32 s31, s1, s10 -; 
GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s18, s30, s18 -; GFX10PLUS-NEXT: s_addc_u32 s23, s31, s23 -; GFX10PLUS-NEXT: s_mul_i32 s31, s2, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s33, s2, s9 -; GFX10PLUS-NEXT: s_cselect_b32 s30, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s18, s31, s18 -; GFX10PLUS-NEXT: s_addc_u32 s23, s33, s23 -; GFX10PLUS-NEXT: s_mul_i32 s33, s3, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s31, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18 -; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23 -; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0 -; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14 -; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 -; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13 -; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0 -; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14 -; GFX10PLUS-NEXT: s_addc_u32 s19, s19, s23 -; GFX10PLUS-NEXT: s_mul_i32 s23, s1, s13 -; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s2, s12 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s2, s12 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s3, s11 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s11 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s4, s10 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s4, s10 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s5, s9 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s5, s9 -; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s6, s8 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s6, s8 -; GFX10PLUS-NEXT: 
s_add_u32 s21, s23, s21 -; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s13 -; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 -; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s0, s13 -; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24 -; GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21 -; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12 -; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12 -; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s34, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21 -; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11 -; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11 -; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s35, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21 -; GFX10PLUS-NEXT: s_mul_i32 s36, s3, s10 -; GFX10PLUS-NEXT: s_mul_hi_u32 s37, s3, s10 -; GFX10PLUS-NEXT: s_cselect_b32 s35, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s36, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s37, s21 -; GFX10PLUS-NEXT: s_mul_i32 s37, s4, s9 -; GFX10PLUS-NEXT: s_mul_hi_u32 s38, s4, s9 -; GFX10PLUS-NEXT: s_cselect_b32 s36, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s37, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s38, s21 -; GFX10PLUS-NEXT: s_mul_i32 s38, s5, s8 -; GFX10PLUS-NEXT: s_mul_hi_u32 s39, s5, s8 -; GFX10PLUS-NEXT: s_cselect_b32 s37, 1, 0 -; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23 -; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21 -; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0 -; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14 -; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0 -; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13 -; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0 -; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12 -; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11 -; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23 -; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15 -; GFX10PLUS-NEXT: 
s_addc_u32 s25, s25, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10 -; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9 -; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8 -; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21 -; GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0 -; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s2 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s36, 0 -; GFX10PLUS-NEXT: s_mov_b32 s2, s17 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s35, 0 -; GFX10PLUS-NEXT: s_mov_b32 s3, s18 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s4 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s34, 0 -; GFX10PLUS-NEXT: s_mov_b32 s4, s19 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10PLUS-NEXT: s_mov_b32 s5, s20 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s6 -; GFX10PLUS-NEXT: s_mov_b32 s6, s15 -; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7 -; GFX10PLUS-NEXT: s_mov_b32 s1, s16 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_mul_i256: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s17, s0, s10 +; GFX10-NEXT: s_mul_i32 s19, s1, s9 +; GFX10-NEXT: s_mul_hi_u32 s18, s0, s10 +; GFX10-NEXT: s_mul_hi_u32 s20, s1, s9 +; GFX10-NEXT: s_add_u32 s17, s19, s17 +; GFX10-NEXT: s_addc_u32 s18, s20, s18 +; GFX10-NEXT: s_mul_i32 s20, s2, s8 +; GFX10-NEXT: s_mul_hi_u32 s21, s2, s8 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_add_u32 s17, s20, s17 +; GFX10-NEXT: s_mul_hi_u32 s16, s0, s8 +; GFX10-NEXT: s_addc_u32 s18, s21, s18 +; GFX10-NEXT: s_mul_i32 s21, s0, s9 +; GFX10-NEXT: s_mul_hi_u32 s22, s0, s9 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_add_u32 s16, s21, s16 +; GFX10-NEXT: s_addc_u32 s21, s22, s17 +; GFX10-NEXT: 
s_mul_i32 s17, s1, s8 +; GFX10-NEXT: s_mul_hi_u32 s23, s1, s8 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_add_u32 s17, s17, s16 +; GFX10-NEXT: s_addc_u32 s16, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s0, s12 +; GFX10-NEXT: s_mul_i32 s25, s1, s11 +; GFX10-NEXT: s_mul_hi_u32 s24, s0, s12 +; GFX10-NEXT: s_mul_hi_u32 s26, s1, s11 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s25, s23 +; GFX10-NEXT: s_addc_u32 s24, s26, s24 +; GFX10-NEXT: s_mul_i32 s26, s2, s10 +; GFX10-NEXT: s_mul_hi_u32 s27, s2, s10 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s26, s23 +; GFX10-NEXT: s_addc_u32 s24, s27, s24 +; GFX10-NEXT: s_mul_i32 s27, s3, s9 +; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX10-NEXT: s_cselect_b32 s26, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s27, s23 +; GFX10-NEXT: s_addc_u32 s24, s28, s24 +; GFX10-NEXT: s_mul_i32 s28, s4, s8 +; GFX10-NEXT: s_mul_hi_u32 s29, s4, s8 +; GFX10-NEXT: s_cselect_b32 s27, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s28, s23 +; GFX10-NEXT: s_addc_u32 s24, s29, s24 +; GFX10-NEXT: s_mul_i32 s29, s0, s11 +; GFX10-NEXT: s_mul_hi_u32 s30, s0, s11 +; GFX10-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s29, s18 +; GFX10-NEXT: s_addc_u32 s23, s30, s23 +; GFX10-NEXT: s_mul_i32 s30, s1, s10 +; GFX10-NEXT: s_mul_hi_u32 s31, s1, s10 +; GFX10-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s30, s18 +; GFX10-NEXT: s_addc_u32 s23, s31, s23 +; GFX10-NEXT: s_mul_i32 s31, s2, s9 +; GFX10-NEXT: s_mul_hi_u32 s33, s2, s9 +; GFX10-NEXT: s_cselect_b32 s30, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s31, s18 +; GFX10-NEXT: s_addc_u32 s23, s33, s23 +; GFX10-NEXT: s_mul_i32 s33, s3, s8 +; GFX10-NEXT: s_mul_hi_u32 s34, s3, s8 +; GFX10-NEXT: s_cselect_b32 s31, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s33, s18 +; GFX10-NEXT: s_addc_u32 s23, s34, s23 +; GFX10-NEXT: s_cselect_b32 s33, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s22, s18 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_add_u32 s18, s18, s21 +; GFX10-NEXT: 
s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX10-NEXT: s_or_b32 s21, s22, s21 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_mul_hi_u32 s22, s0, s14 +; GFX10-NEXT: s_addc_u32 s19, s19, 0 +; GFX10-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX10-NEXT: s_add_u32 s19, s19, s23 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_and_b32 s21, s21, 1 +; GFX10-NEXT: s_mul_i32 s23, s1, s13 +; GFX10-NEXT: s_add_u32 s19, s19, s21 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX10-NEXT: s_or_b32 s20, s20, s21 +; GFX10-NEXT: s_mul_i32 s21, s0, s14 +; GFX10-NEXT: s_mul_hi_u32 s37, s3, s10 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s2, s12 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s2, s12 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s3, s11 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s3, s11 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s4, s10 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s4, s10 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s5, s9 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s5, s9 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s6, s8 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s6, s8 +; GFX10-NEXT: s_add_u32 s21, s23, s21 +; GFX10-NEXT: s_mul_i32 s23, s0, s13 +; GFX10-NEXT: s_addc_u32 s22, s34, s22 +; GFX10-NEXT: s_mul_hi_u32 s34, s0, s13 +; GFX10-NEXT: s_add_u32 s23, s23, s24 +; GFX10-NEXT: s_addc_u32 s21, s34, s21 +; GFX10-NEXT: s_mul_i32 s34, s1, s12 +; GFX10-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s34, s23 +; GFX10-NEXT: s_addc_u32 s21, s35, s21 +; GFX10-NEXT: s_mul_i32 s35, s2, s11 +; GFX10-NEXT: s_cselect_b32 s34, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s35, s23 +; GFX10-NEXT: s_addc_u32 s21, s36, s21 +; GFX10-NEXT: 
s_mul_i32 s36, s3, s10 +; GFX10-NEXT: s_cselect_b32 s35, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s36, s23 +; GFX10-NEXT: s_addc_u32 s21, s37, s21 +; GFX10-NEXT: s_mul_i32 s37, s4, s9 +; GFX10-NEXT: s_mul_hi_u32 s38, s4, s9 +; GFX10-NEXT: s_cselect_b32 s36, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s37, s23 +; GFX10-NEXT: s_addc_u32 s21, s38, s21 +; GFX10-NEXT: s_mul_i32 s38, s5, s8 +; GFX10-NEXT: s_mul_hi_u32 s39, s5, s8 +; GFX10-NEXT: s_cselect_b32 s37, 1, 0 +; GFX10-NEXT: s_add_u32 s23, s38, s23 +; GFX10-NEXT: s_addc_u32 s21, s39, s21 +; GFX10-NEXT: s_cselect_b32 s38, 1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10-NEXT: s_mul_i32 s15, s0, s15 +; GFX10-NEXT: s_addc_u32 s29, s29, 0 +; GFX10-NEXT: s_cmp_lg_u32 s31, 0 +; GFX10-NEXT: s_mul_i32 s1, s1, s14 +; GFX10-NEXT: s_addc_u32 s29, s29, 0 +; GFX10-NEXT: s_cmp_lg_u32 s33, 0 +; GFX10-NEXT: s_mul_i32 s2, s2, s13 +; GFX10-NEXT: s_addc_u32 s29, s29, 0 +; GFX10-NEXT: s_mul_i32 s3, s3, s12 +; GFX10-NEXT: s_add_u32 s23, s29, s23 +; GFX10-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10-NEXT: s_and_b32 s20, s20, 1 +; GFX10-NEXT: s_mul_i32 s4, s4, s11 +; GFX10-NEXT: s_add_u32 s20, s23, s20 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_mul_i32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s23, s29, s23 +; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_mul_i32 s6, s6, s9 +; GFX10-NEXT: s_addc_u32 s25, s25, 0 +; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_mul_i32 s7, s7, s8 +; GFX10-NEXT: s_addc_u32 s25, s25, 0 +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_mul_i32 s0, s0, s8 +; GFX10-NEXT: s_addc_u32 s25, s25, 0 +; GFX10-NEXT: s_add_u32 s21, s25, s21 +; GFX10-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_add_u32 s21, s21, s23 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_or_b32 s23, s25, s23 +; GFX10-NEXT: s_and_b32 s23, s23, 1 +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_addc_u32 s15, s22, s15 +; GFX10-NEXT: s_cmp_lg_u32 s38, 0 +; GFX10-NEXT: s_addc_u32 s1, s15, s1 +; 
GFX10-NEXT: s_cmp_lg_u32 s37, 0 +; GFX10-NEXT: s_addc_u32 s1, s1, s2 +; GFX10-NEXT: s_cmp_lg_u32 s36, 0 +; GFX10-NEXT: s_mov_b32 s2, s16 +; GFX10-NEXT: s_addc_u32 s1, s1, s3 +; GFX10-NEXT: s_cmp_lg_u32 s35, 0 +; GFX10-NEXT: s_mov_b32 s3, s18 +; GFX10-NEXT: s_addc_u32 s1, s1, s4 +; GFX10-NEXT: s_cmp_lg_u32 s34, 0 +; GFX10-NEXT: s_mov_b32 s4, s19 +; GFX10-NEXT: s_addc_u32 s1, s1, s5 +; GFX10-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10-NEXT: s_mov_b32 s5, s20 +; GFX10-NEXT: s_addc_u32 s1, s1, s6 +; GFX10-NEXT: s_mov_b32 s6, s21 +; GFX10-NEXT: s_add_i32 s7, s1, s7 +; GFX10-NEXT: s_mov_b32 s1, s17 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_mul_i256: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mul_i32 s17, s0, s10 +; GFX11-NEXT: s_mul_i32 s19, s1, s9 +; GFX11-NEXT: s_mul_hi_u32 s18, s0, s10 +; GFX11-NEXT: s_mul_hi_u32 s20, s1, s9 +; GFX11-NEXT: s_add_u32 s17, s19, s17 +; GFX11-NEXT: s_addc_u32 s18, s20, s18 +; GFX11-NEXT: s_mul_i32 s20, s2, s8 +; GFX11-NEXT: s_mul_hi_u32 s21, s2, s8 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_add_u32 s17, s20, s17 +; GFX11-NEXT: s_mul_hi_u32 s16, s0, s8 +; GFX11-NEXT: s_addc_u32 s18, s21, s18 +; GFX11-NEXT: s_mul_i32 s21, s0, s9 +; GFX11-NEXT: s_mul_hi_u32 s22, s0, s9 +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: s_add_u32 s16, s21, s16 +; GFX11-NEXT: s_addc_u32 s17, s22, s17 +; GFX11-NEXT: s_mul_i32 s22, s1, s8 +; GFX11-NEXT: s_mul_hi_u32 s23, s1, s8 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_add_u32 s16, s22, s16 +; GFX11-NEXT: s_addc_u32 s17, s23, s17 +; GFX11-NEXT: s_mul_i32 s23, s0, s12 +; GFX11-NEXT: s_mul_i32 s25, s1, s11 +; GFX11-NEXT: s_mul_hi_u32 s24, s0, s12 +; GFX11-NEXT: s_mul_hi_u32 s26, s1, s11 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s25, s23 +; GFX11-NEXT: s_addc_u32 s24, s26, s24 +; GFX11-NEXT: s_mul_i32 s26, s2, s10 +; GFX11-NEXT: s_mul_hi_u32 s27, s2, s10 +; GFX11-NEXT: s_cselect_b32 s25, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s26, s23 +; GFX11-NEXT: 
s_addc_u32 s24, s27, s24 +; GFX11-NEXT: s_mul_i32 s27, s3, s9 +; GFX11-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX11-NEXT: s_cselect_b32 s26, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s27, s23 +; GFX11-NEXT: s_addc_u32 s24, s28, s24 +; GFX11-NEXT: s_mul_i32 s28, s4, s8 +; GFX11-NEXT: s_mul_hi_u32 s29, s4, s8 +; GFX11-NEXT: s_cselect_b32 s27, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s28, s23 +; GFX11-NEXT: s_addc_u32 s24, s29, s24 +; GFX11-NEXT: s_mul_i32 s29, s0, s11 +; GFX11-NEXT: s_mul_hi_u32 s30, s0, s11 +; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s29, s18 +; GFX11-NEXT: s_addc_u32 s23, s30, s23 +; GFX11-NEXT: s_mul_i32 s30, s1, s10 +; GFX11-NEXT: s_mul_hi_u32 s31, s1, s10 +; GFX11-NEXT: s_cselect_b32 s29, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s30, s18 +; GFX11-NEXT: s_addc_u32 s23, s31, s23 +; GFX11-NEXT: s_mul_i32 s31, s2, s9 +; GFX11-NEXT: s_mul_hi_u32 s33, s2, s9 +; GFX11-NEXT: s_cselect_b32 s30, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s31, s18 +; GFX11-NEXT: s_addc_u32 s23, s33, s23 +; GFX11-NEXT: s_mul_i32 s33, s3, s8 +; GFX11-NEXT: s_mul_hi_u32 s34, s3, s8 +; GFX11-NEXT: s_cselect_b32 s31, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s33, s18 +; GFX11-NEXT: s_addc_u32 s23, s34, s23 +; GFX11-NEXT: s_cselect_b32 s33, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s21, s18 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_add_u32 s18, s18, s22 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX11-NEXT: s_or_b32 s21, s21, s22 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mul_hi_u32 s22, s0, s14 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX11-NEXT: s_add_u32 s19, s19, s23 +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: s_and_b32 s21, s21, 1 +; GFX11-NEXT: s_mul_i32 s23, s1, s13 +; GFX11-NEXT: s_add_u32 s19, s19, s21 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX11-NEXT: s_or_b32 s20, s20, s21 +; GFX11-NEXT: s_mul_i32 s21, s0, s14 +; GFX11-NEXT: 
s_mul_hi_u32 s37, s3, s10 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s2, s12 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s2, s12 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s3, s11 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s3, s11 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s4, s10 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s4, s10 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s5, s9 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s5, s9 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s6, s8 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s6, s8 +; GFX11-NEXT: s_add_u32 s21, s23, s21 +; GFX11-NEXT: s_mul_i32 s23, s0, s13 +; GFX11-NEXT: s_addc_u32 s22, s34, s22 +; GFX11-NEXT: s_mul_hi_u32 s34, s0, s13 +; GFX11-NEXT: s_add_u32 s23, s23, s24 +; GFX11-NEXT: s_addc_u32 s21, s34, s21 +; GFX11-NEXT: s_mul_i32 s34, s1, s12 +; GFX11-NEXT: s_cselect_b32 s24, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s34, s23 +; GFX11-NEXT: s_addc_u32 s21, s35, s21 +; GFX11-NEXT: s_mul_i32 s35, s2, s11 +; GFX11-NEXT: s_cselect_b32 s34, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s35, s23 +; GFX11-NEXT: s_addc_u32 s21, s36, s21 +; GFX11-NEXT: s_mul_i32 s36, s3, s10 +; GFX11-NEXT: s_cselect_b32 s35, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s36, s23 +; GFX11-NEXT: s_addc_u32 s21, s37, s21 +; GFX11-NEXT: s_mul_i32 s37, s4, s9 +; GFX11-NEXT: s_mul_hi_u32 s38, s4, s9 +; GFX11-NEXT: s_cselect_b32 s36, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s37, s23 +; GFX11-NEXT: s_addc_u32 s21, s38, s21 +; GFX11-NEXT: s_mul_i32 s38, s5, s8 +; GFX11-NEXT: s_mul_hi_u32 s39, s5, s8 +; GFX11-NEXT: s_cselect_b32 s37, 1, 0 +; GFX11-NEXT: s_add_u32 s23, s38, s23 +; GFX11-NEXT: s_addc_u32 s21, s39, s21 +; GFX11-NEXT: s_cselect_b32 s38, 1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s30, 0 +; GFX11-NEXT: 
s_mul_i32 s15, s0, s15 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_cmp_lg_u32 s31, 0 +; GFX11-NEXT: s_mul_i32 s1, s1, s14 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_cmp_lg_u32 s33, 0 +; GFX11-NEXT: s_mul_i32 s2, s2, s13 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_mul_i32 s3, s3, s12 +; GFX11-NEXT: s_add_u32 s23, s29, s23 +; GFX11-NEXT: s_cselect_b32 s29, 1, 0 +; GFX11-NEXT: s_and_b32 s20, s20, 1 +; GFX11-NEXT: s_mul_i32 s4, s4, s11 +; GFX11-NEXT: s_add_u32 s20, s23, s20 +; GFX11-NEXT: s_cselect_b32 s23, 1, 0 +; GFX11-NEXT: s_mul_i32 s5, s5, s10 +; GFX11-NEXT: s_or_b32 s23, s29, s23 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mul_i32 s6, s6, s9 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_mul_i32 s7, s7, s8 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mul_i32 s0, s0, s8 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s21, s25, s21 +; GFX11-NEXT: s_cselect_b32 s25, 1, 0 +; GFX11-NEXT: s_and_b32 s23, s23, 1 +; GFX11-NEXT: s_add_u32 s21, s21, s23 +; GFX11-NEXT: s_cselect_b32 s23, 1, 0 +; GFX11-NEXT: s_or_b32 s23, s25, s23 +; GFX11-NEXT: s_and_b32 s23, s23, 1 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_addc_u32 s15, s22, s15 +; GFX11-NEXT: s_cmp_lg_u32 s38, 0 +; GFX11-NEXT: s_addc_u32 s1, s15, s1 +; GFX11-NEXT: s_cmp_lg_u32 s37, 0 +; GFX11-NEXT: s_addc_u32 s1, s1, s2 +; GFX11-NEXT: s_cmp_lg_u32 s36, 0 +; GFX11-NEXT: s_mov_b32 s2, s17 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: s_cmp_lg_u32 s35, 0 +; GFX11-NEXT: s_mov_b32 s3, s18 +; GFX11-NEXT: s_addc_u32 s1, s1, s4 +; GFX11-NEXT: s_cmp_lg_u32 s34, 0 +; GFX11-NEXT: s_mov_b32 s4, s19 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s5, s20 +; GFX11-NEXT: s_addc_u32 s1, s1, s6 +; GFX11-NEXT: s_mov_b32 s6, s21 +; GFX11-NEXT: s_add_i32 s7, s1, s7 +; GFX11-NEXT: s_mov_b32 s1, s16 +; GFX11-NEXT: ; return to shader part 
epilog ; ; GFX12-LABEL: s_mul_i256: ; GFX12: ; %bb.0: @@ -1917,18 +2166,26 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_u32 s18, s33, s18 ; GFX12-NEXT: s_add_co_ci_u32 s23, s34, s23 ; GFX12-NEXT: s_cselect_b32 s33, 1, 0 -; GFX12-NEXT: s_cmp_lg_u32 s22, 0 -; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14 -; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18 +; GFX12-NEXT: s_add_co_u32 s18, s21, s18 ; GFX12-NEXT: s_cselect_b32 s21, 1, 0 -; GFX12-NEXT: s_cmp_lg_u32 s20, 0 +; GFX12-NEXT: s_add_co_u32 s18, s18, s22 +; GFX12-NEXT: s_cselect_b32 s22, 1, 0 ; GFX12-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX12-NEXT: s_or_b32 s21, s21, s22 +; GFX12-NEXT: s_cmp_lg_u32 s20, 0 +; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14 ; GFX12-NEXT: s_add_co_ci_u32 s19, s19, 0 -; GFX12-NEXT: s_cmp_lg_u32 s21, 0 -; GFX12-NEXT: s_mul_i32 s21, s0, s14 -; GFX12-NEXT: s_add_co_ci_u32 s19, s19, s23 -; GFX12-NEXT: s_mul_i32 s23, s1, s13 +; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX12-NEXT: s_add_co_u32 s19, s19, s23 ; GFX12-NEXT: s_cselect_b32 s20, 1, 0 +; GFX12-NEXT: s_and_b32 s21, s21, 1 +; GFX12-NEXT: s_mul_i32 s23, s1, s13 +; GFX12-NEXT: s_add_co_u32 s19, s19, s21 +; GFX12-NEXT: s_cselect_b32 s21, 1, 0 +; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX12-NEXT: s_or_b32 s20, s20, s21 +; GFX12-NEXT: s_mul_i32 s21, s0, s14 +; GFX12-NEXT: s_mul_hi_u32 s37, s3, s10 ; GFX12-NEXT: s_add_co_u32 s21, s23, s21 ; GFX12-NEXT: s_mul_i32 s23, s2, s12 ; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 @@ -1956,17 +2213,14 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_u32 s23, s23, s24 ; GFX12-NEXT: s_add_co_ci_u32 s21, s34, s21 ; GFX12-NEXT: s_mul_i32 s34, s1, s12 -; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX12-NEXT: s_cselect_b32 s24, 1, 0 ; GFX12-NEXT: s_add_co_u32 s23, s34, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s35, s21 ; GFX12-NEXT: s_mul_i32 s35, s2, s11 -; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11 ; GFX12-NEXT: s_cselect_b32 s34, 
1, 0 ; GFX12-NEXT: s_add_co_u32 s23, s35, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s36, s21 ; GFX12-NEXT: s_mul_i32 s36, s3, s10 -; GFX12-NEXT: s_mul_hi_u32 s37, s3, s10 ; GFX12-NEXT: s_cselect_b32 s35, 1, 0 ; GFX12-NEXT: s_add_co_u32 s23, s36, s23 ; GFX12-NEXT: s_add_co_ci_u32 s21, s37, s21 @@ -1982,34 +2236,46 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_ci_u32 s21, s39, s21 ; GFX12-NEXT: s_cselect_b32 s38, 1, 0 ; GFX12-NEXT: s_cmp_lg_u32 s30, 0 -; GFX12-NEXT: s_mul_i32 s1, s1, s14 +; GFX12-NEXT: s_mul_i32 s15, s0, s15 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX12-NEXT: s_cmp_lg_u32 s31, 0 -; GFX12-NEXT: s_mul_i32 s2, s2, s13 +; GFX12-NEXT: s_mul_i32 s1, s1, s14 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 ; GFX12-NEXT: s_cmp_lg_u32 s33, 0 -; GFX12-NEXT: s_mul_i32 s3, s3, s12 +; GFX12-NEXT: s_mul_i32 s2, s2, s13 ; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 -; GFX12-NEXT: s_cmp_lg_u32 s20, 0 +; GFX12-NEXT: s_mul_i32 s3, s3, s12 +; GFX12-NEXT: s_add_co_u32 s23, s29, s23 +; GFX12-NEXT: s_cselect_b32 s29, 1, 0 +; GFX12-NEXT: s_and_b32 s20, s20, 1 ; GFX12-NEXT: s_mul_i32 s4, s4, s11 -; GFX12-NEXT: s_add_co_ci_u32 s20, s29, s23 +; GFX12-NEXT: s_add_co_u32 s20, s23, s20 ; GFX12-NEXT: s_cselect_b32 s23, 1, 0 +; GFX12-NEXT: s_mul_i32 s5, s5, s10 +; GFX12-NEXT: s_or_b32 s23, s29, s23 ; GFX12-NEXT: s_cmp_lg_u32 s26, 0 -; GFX12-NEXT: s_mul_i32 s26, s0, s15 +; GFX12-NEXT: s_mul_i32 s6, s6, s9 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 ; GFX12-NEXT: s_cmp_lg_u32 s27, 0 -; GFX12-NEXT: s_mul_i32 s5, s5, s10 +; GFX12-NEXT: s_mul_i32 s7, s7, s8 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 ; GFX12-NEXT: s_cmp_lg_u32 s28, 0 -; GFX12-NEXT: s_mul_i32 s6, s6, s9 +; GFX12-NEXT: s_mul_i32 s0, s0, s8 ; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_co_u32 s21, s25, s21 +; GFX12-NEXT: s_cselect_b32 s25, 1, 0 +; GFX12-NEXT: s_and_b32 s23, 
s23, 1 +; GFX12-NEXT: s_add_co_u32 s21, s21, s23 +; GFX12-NEXT: s_cselect_b32 s23, 1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s23, s25, s23 +; GFX12-NEXT: s_and_b32 s23, s23, 1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_lg_u32 s23, 0 -; GFX12-NEXT: s_mul_i32 s7, s7, s8 -; GFX12-NEXT: s_add_co_ci_u32 s15, s25, s21 -; GFX12-NEXT: s_add_co_ci_u32 s21, s22, s26 +; GFX12-NEXT: s_add_co_ci_u32 s15, s22, s15 ; GFX12-NEXT: s_cmp_lg_u32 s38, 0 -; GFX12-NEXT: s_mul_i32 s0, s0, s8 -; GFX12-NEXT: s_add_co_ci_u32 s1, s21, s1 +; GFX12-NEXT: s_add_co_ci_u32 s1, s15, s1 ; GFX12-NEXT: s_cmp_lg_u32 s37, 0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s2 ; GFX12-NEXT: s_cmp_lg_u32 s36, 0 @@ -2024,7 +2290,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_cmp_lg_u32 s24, 0 ; GFX12-NEXT: s_mov_b32 s5, s20 ; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s6 -; GFX12-NEXT: s_mov_b32 s6, s15 +; GFX12-NEXT: s_mov_b32 s6, s21 ; GFX12-NEXT: s_add_co_i32 s7, s1, s7 ; GFX12-NEXT: s_mov_b32 s1, s16 ; GFX12-NEXT: ; return to shader part epilog @@ -2037,208 +2303,244 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v16, v0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX7-NEXT: v_mov_b32_e32 v17, v1 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc -; GFX7-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mov_b32_e32 v18, v23 -; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX7-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-NEXT: v_mov_b32_e32 v1, v23 -; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX7-NEXT: v_mov_b32_e32 v2, v22 -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, 
v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v22, s[6:7], 0, v22, s[6:7] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v4, v8, v[16:17] +; GFX7-NEXT: v_mov_b32_e32 v16, v19 +; GFX7-NEXT: v_mov_b32_e32 v17, v20 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v25, 0, 1, s[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[10:11], v1, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0 +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20] +; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v25, s[10:11], 0, v25, s[10:11] +; GFX7-NEXT: v_add_i32_e64 v23, s[14:15], v23, v19 +; GFX7-NEXT: v_add_i32_e64 v24, s[16:17], v22, v20 +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0 +; GFX7-NEXT: v_addc_u32_e64 v25, s[12:13], 0, v25, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20] +; GFX7-NEXT: v_addc_u32_e64 v25, s[6:7], 0, v25, s[6:7] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20] +; GFX7-NEXT: v_mov_b32_e32 v22, v19 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22] +; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21] +; GFX7-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GFX7-NEXT: v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v19, s[4:5], 0, v19, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22] +; 
GFX7-NEXT: v_addc_u32_e64 v19, s[8:9], 0, v19, s[8:9] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22] ; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX7-NEXT: v_mul_lo_u32 v11, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22] +; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22] +; GFX7-NEXT: v_add_i32_e64 v21, s[6:7], v25, v21 +; GFX7-NEXT: v_add_i32_e64 v19, s[20:21], v19, v22 +; GFX7-NEXT: v_mul_lo_u32 v22, v6, v9 +; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18] +; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[22:23] +; GFX7-NEXT: v_add_i32_e64 v3, s[22:23], v23, v3 +; GFX7-NEXT: s_or_b64 s[14:15], s[14:15], s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[14:15] +; GFX7-NEXT: v_add_i32_e64 v4, s[14:15], v24, v4 +; GFX7-NEXT: s_or_b64 s[14:15], 
s[16:17], s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[14:15] +; GFX7-NEXT: v_add_i32_e64 v5, s[14:15], v21, v5 +; GFX7-NEXT: s_or_b64 s[6:7], s[6:7], s[14:15] +; GFX7-NEXT: v_mul_lo_u32 v1, v1, v14 +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GFX7-NEXT: v_add_i32_e64 v6, s[6:7], v19, v6 +; GFX7-NEXT: s_or_b64 s[6:7], s[20:21], s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v20, v0, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v1, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v2, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v12, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v11, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v25, vcc +; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v22, s[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v16 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v16, v0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; 
GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mov_b32_e32 v18, v23 -; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-NEXT: v_mov_b32_e32 v1, v23 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v2, v22 -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v22, s[6:7], 0, v22, s[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], 
s[8:9], v4, v8, v[16:17] +; GFX8-NEXT: v_mov_b32_e32 v16, v19 +; GFX8-NEXT: v_mov_b32_e32 v17, v20 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v25, 0, 1, s[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[10:11], v1, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0 +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20] +; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v25, s[10:11], 0, v25, s[10:11] +; GFX8-NEXT: v_add_u32_e64 v23, s[14:15], v23, v19 +; GFX8-NEXT: v_add_u32_e64 v24, s[16:17], v22, v20 +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0 +; GFX8-NEXT: v_addc_u32_e64 v25, s[12:13], 0, v25, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20] +; GFX8-NEXT: v_addc_u32_e64 v25, s[6:7], 0, v25, s[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20] +; GFX8-NEXT: v_mov_b32_e32 v22, v19 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22] +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21] +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GFX8-NEXT: v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v19, s[4:5], 0, v19, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v19, s[8:9], 0, v19, s[8:9] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22] ; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX8-NEXT: 
v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15 -; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX8-NEXT: v_mul_lo_u32 v11, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22] +; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22] +; GFX8-NEXT: v_add_u32_e64 v21, s[6:7], v25, v21 +; GFX8-NEXT: v_add_u32_e64 v19, s[20:21], v19, v22 +; GFX8-NEXT: v_mul_lo_u32 v22, v6, v9 +; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[22:23] +; GFX8-NEXT: v_add_u32_e64 v3, s[22:23], v23, v3 +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[14:15] +; GFX8-NEXT: v_add_u32_e64 v4, s[14:15], v24, v4 +; GFX8-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[14:15] +; GFX8-NEXT: v_add_u32_e64 v5, s[14:15], v21, v5 +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[14:15] +; GFX8-NEXT: v_mul_lo_u32 v1, v1, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GFX8-NEXT: v_add_u32_e64 
v6, s[6:7], v19, v6 +; GFX8-NEXT: s_or_b64 s[6:7], s[20:21], s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v20, v0, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v1, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v2, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v12, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v11, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v25, vcc +; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v22, s[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v16 +; GFX8-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-NEXT: v_mov_b32_e32 v2, v10 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mov_b32_e32 v18, v23 -; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20] -; GFX9-NEXT: 
v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1] -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v20 -; GFX9-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1] -; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v22 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v22, s[6:7], 0, v22, s[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v4, v8, v[16:17] +; GFX9-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-NEXT: v_mov_b32_e32 v17, v20 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, s[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[10:11], 
v1, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v25, s[10:11], 0, v25, s[10:11] +; GFX9-NEXT: v_add_co_u32_e64 v23, s[14:15], v23, v19 +; GFX9-NEXT: v_add_co_u32_e64 v24, s[16:17], v22, v20 +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v25, s[12:13], 0, v25, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e64 v25, s[6:7], 0, v25, s[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20] +; GFX9-NEXT: v_mov_b32_e32 v22, v19 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22] +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[20:21] +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v19, s[4:5], 0, v19, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v19, s[8:9], 0, v19, s[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22] ; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9] -; GFX9-NEXT: 
v_mul_lo_u32 v10, v16, v15 -; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10] +; GFX9-NEXT: v_mul_lo_u32 v11, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22] +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22] +; GFX9-NEXT: v_add_co_u32_e64 v21, s[6:7], v25, v21 +; GFX9-NEXT: v_add_co_u32_e64 v19, s[20:21], v19, v22 +; GFX9-NEXT: v_mul_lo_u32 v22, v6, v9 +; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[22:23] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[22:23], v23, v3 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[14:15] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[14:15], v24, v4 +; GFX9-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[14:15] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[14:15], v21, v5 +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[14:15] +; GFX9-NEXT: v_mul_lo_u32 v1, v1, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[6:7], v19, v6 +; GFX9-NEXT: s_or_b64 s[6:7], s[20:21], s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v20, v0, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v1, s[12:13] 
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v2, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v12, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v11, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v25, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v22, s[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-NEXT: v_mov_b32_e32 v2, v10 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2246,69 +2548,82 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 +; GFX10-NEXT: v_mov_b32_e32 v18, v2 ; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX10-NEXT: v_mul_lo_u32 v28, v4, v11 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s5, v16, v10, 0 +; GFX10-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX10-NEXT: v_mul_lo_u32 v14, v17, v14 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v18, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s4, v17, v11, v[19:20] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20] +; 
GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s4, v17, v9, v[21:22] +; GFX10-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20] ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 -; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s4, v18, v8, v[21:22] +; GFX10-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20] +; GFX10-NEXT: v_add_co_ci_u32_e64 v25, s4, 0, v25, s4 +; GFX10-NEXT: v_mad_u64_u32 v[23:24], s5, v6, v8, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v22 +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v1, v19 +; GFX10-NEXT: v_mov_b32_e32 v19, v20 +; GFX10-NEXT: v_mov_b32_e32 v20, v23 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v16, v11, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 
v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_mul_lo_u32 v13, v18, v13 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v17, v10, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v2, s4 +; GFX10-NEXT: v_mad_u64_u32 v[19:20], s5, v17, v12, v[19:20] +; GFX10-NEXT: v_mov_b32_e32 v2, v21 +; GFX10-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v18, v9, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4 +; 
GFX10-NEXT: v_mad_u64_u32 v[19:20], s6, v18, v11, v[19:20] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s7, v16, v9, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s4, v3, v8, v[22:23] +; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s7 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s7, v3, v10, v[19:20] +; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s4, 0, v6, s4 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v17, v8, v[1:2] +; GFX10-NEXT: v_add_co_u32 v3, s8, v23, v21 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v4, v9, v[10:11] +; GFX10-NEXT: v_add_co_u32 v4, s10, v25, v22 +; GFX10-NEXT: v_add_co_u32 v3, s9, v3, v6 +; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[9:10] +; GFX10-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX10-NEXT: v_add_co_u32 v4, s9, v4, v11 +; GFX10-NEXT: s_or_b32 s9, s10, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s9 +; GFX10-NEXT: v_add_co_u32 v5, s9, v19, v5 +; GFX10-NEXT: v_add_co_u32 v6, s11, v26, v6 +; GFX10-NEXT: v_add_co_u32 v5, s10, v5, v9 +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s9 +; GFX10-NEXT: v_add_co_u32 v6, s9, v6, v9 +; GFX10-NEXT: s_or_b32 s9, s11, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v24, v10, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v14, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v13, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v12, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v28, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v29, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2316,69 +2631,81 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX11-NEXT: v_mov_b32_e32 v18, v2 ; GFX11-NEXT: v_mul_lo_u32 v7, v7, v8 ; 
GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[18:19], null, v16, v12, 0 -; GFX11-NEXT: v_mul_lo_u32 v30, v17, v14 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], null, v16, v10, 0 +; GFX11-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX11-NEXT: v_mul_lo_u32 v29, v5, v10 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], s0, v17, v11, v[19:20] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s0, v17, v9, v[21:22] +; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v18, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20] +; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX11-NEXT: v_mul_lo_u32 v14, v17, v14 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20] +; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20] +; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 
v[22:23], null, v6, v8, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v20, v22 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX11-NEXT: v_mov_b32_e32 v20, v18 -; GFX11-NEXT: v_mov_b32_e32 v19, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20] +; GFX11-NEXT: v_mad_u64_u32 v[23:24], null, v6, v8, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, v19 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s0, v18, v8, v[21:22] +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, s0, 0, v25, s0 +; GFX11-NEXT: v_dual_mov_b32 v19, v20 :: v_dual_mov_b32 v20, v23 +; GFX11-NEXT: v_mov_b32_e32 v0, v22 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20] +; GFX11-NEXT: v_mul_lo_u32 v13, v18, v13 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s0, v16, v11, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v8, 0 -; GFX11-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX11-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX11-NEXT: v_mov_b32_e32 v14, v21 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX11-NEXT: v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19] -; GFX11-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX11-NEXT: v_mov_b32_e32 v13, v1 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12] -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v8, 
v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 -; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], s1, v17, v12, v[19:20] +; GFX11-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s0, v17, v10, v[22:23] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, v21 +; GFX11-NEXT: v_mad_u64_u32 v[19:20], s2, v18, v11, v[19:20] +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s0, v18, v9, v[22:23] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s3, v16, v9, v[1:2] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s0, v3, v8, v[22:23] +; GFX11-NEXT: v_cndmask_b32_e64 v23, 0, 1, s3 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s3, v3, v10, v[19:20] +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v6, s0 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v17, v8, v[1:2] +; GFX11-NEXT: v_add_co_u32 v3, s4, v23, v21 +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], s0, v4, v9, v[10:11] +; GFX11-NEXT: v_add_co_u32 v4, s6, v25, v22 +; GFX11-NEXT: v_add_co_u32 v3, s5, v3, v6 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[9:10] +; GFX11-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX11-NEXT: 
v_add_co_u32 v4, s5, v4, v11 +; GFX11-NEXT: s_or_b32 s5, s6, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX11-NEXT: v_add_co_u32 v5, s5, v19, v5 +; GFX11-NEXT: v_add_co_u32 v6, s7, v26, v6 +; GFX11-NEXT: v_add_co_u32 v5, s6, v5, v9 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX11-NEXT: v_add_co_u32 v6, s5, v6, v9 +; GFX11-NEXT: s_or_b32 s5, s7, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v24, v10, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v14, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v13, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v12, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v28, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v29, s1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo ; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2390,90 +2717,104 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX12-NEXT: v_mov_b32_e32 v18, v2 ; GFX12-NEXT: v_mul_lo_u32 v7, v7, v8 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], null, v16, v12, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], null, v16, v10, 0 +; GFX12-NEXT: v_mul_lo_u32 v28, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v29, v5, v10 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX12-NEXT: 
v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], s0, v17, v11, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s0, v17, v9, v[21:22] +; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v18, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX12-NEXT: v_mul_lo_u32 v14, v17, v14 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20] +; 
GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v20, v22 -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], null, v6, v8, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v1, v19 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s0, v18, v8, v[21:22] +; GFX12-NEXT: v_add_co_ci_u32_e64 v25, s0, 0, v25, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v19, v20 :: v_dual_mov_b32 v20, v23 +; GFX12-NEXT: v_mov_b32_e32 v0, v22 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20] +; GFX12-NEXT: v_mul_lo_u32 v13, v18, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v16, v11, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; 
GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX12-NEXT: v_mov_b32_e32 v13, v1 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX12-NEXT: v_mov_b32_e32 v14, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], s1, v17, v12, v[19:20] +; GFX12-NEXT: v_mul_lo_u32 v12, v3, v12 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v17, v10, v[22:23] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, v21 +; GFX12-NEXT: v_mad_co_u64_u32 v[19:20], s2, v18, v11, v[19:20] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: 
v_mad_co_u64_u32 v[22:23], s0, v18, v9, v[22:23] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s3, v16, v9, v[1:2] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s0, v3, v8, v[22:23] +; GFX12-NEXT: v_cndmask_b32_e64 v23, 0, 1, s3 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v6, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v17, v8, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_u32 v3, s4, v23, v21 +; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], s0, v4, v9, v[10:11] +; GFX12-NEXT: v_add_co_u32 v4, s6, v25, v22 +; GFX12-NEXT: v_add_co_u32 v3, s5, v3, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s4, s4, s5 +; GFX12-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[9:10] +; GFX12-NEXT: v_mul_lo_u32 v10, v16, v15 +; GFX12-NEXT: v_add_co_u32 v4, s5, v4, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s5, s6, s5 +; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v5, s5, v19, v5 +; GFX12-NEXT: v_add_co_u32 v6, s7, v26, v6 +; GFX12-NEXT: v_add_co_u32 v5, s6, v5, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s5, s5, s6 +; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 -; GFX12-NEXT: 
v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: v_add_co_u32 v6, s5, v6, v9 +; GFX12-NEXT: s_or_b32 s5, s7, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v24, v10, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v14, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v13, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v12, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v28, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s0, v9, v29, s1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo ; GFX12-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 1821d29d4b050..ae6bcb6b08202 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2618,10 +2618,13 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: 
v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2630,24 +2633,45 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_uaddsat_i48: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] -; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX10PLUS-NEXT: 
s_setpc_b64 s[30:31] +; GFX10-LABEL: v_uaddsat_i48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v2 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v2 +; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, s4 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_i48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, s1, v0, v2 +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: v_add_co_u32 v1, s1, v1, v2 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) ret i48 %result } @@ -2677,7 +2701,14 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s2 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: ; return to shader part epilog @@ -2687,7 +2718,14 @@ define amdgpu_ps i48 
@s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s2 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: ; return to shader part epilog @@ -2697,7 +2735,14 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s2 +; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s2, s3, s2 +; GFX10PLUS-NEXT: s_and_b32 s2, s2, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -2728,11 +2773,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 +; GFX8-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: ; return to shader part epilog ; @@ -2740,11 +2787,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; @@ -2752,10 +2801,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, s1, v1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 
@llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) @@ -2787,11 +2839,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: ; return to shader part epilog ; @@ -2799,11 +2853,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; @@ -2811,10 +2867,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX10PLUS-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, v0, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, v1, s1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) @@ -2827,38 +2886,62 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-LABEL: v_uaddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v0, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v1, v2 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, 
s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_uaddsat_i64: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_uaddsat_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v2 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v2 +; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s1, v0, v2 +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX11-NEXT: v_add_co_u32 v1, s1, v1, v2 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 
v1, v1, -1, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } @@ -2867,28 +2950,56 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_uaddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s2 -; GFX6-NEXT: s_addc_u32 s1, s1, s3 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s3 +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s2 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_or_b32 s2, s3, s2 +; GFX6-NEXT: s_and_b32 s2, s2, 1 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s2 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s2 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s2 +; 
GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s2, s3, s2 +; GFX10PLUS-NEXT: s_and_b32 s2, s2, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) @@ -2898,37 +3009,46 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX6-LABEL: uaddsat_i64_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v1, s[0:1], v1, v2 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: uaddsat_i64_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i64_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 
v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: uaddsat_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, s1, v1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -2938,37 +3058,46 @@ define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX6-LABEL: uaddsat_i64_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v1, s[0:1], v1, v2 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; 
GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: uaddsat_i64_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v2 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i64_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: uaddsat_i64_vs: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, v0, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, v1, s1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 ; GFX10PLUS-NEXT: ; 
return to shader part epilog %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -2979,51 +3108,75 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-LABEL: v_uaddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v0, v4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v1, v4 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v3, s[4:5], v3, v4 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, s[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; GFX8-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v6 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[4:5], v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v6 
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v4 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v6 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v4 +; GFX10-NEXT: s_or_b32 s4, s4, s6 +; GFX10-NEXT: v_add_co_u32 v3, s7, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, s4 +; GFX10-NEXT: s_or_b32 s4, s5, s7 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3031,12 +3184,18 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-LABEL: v_uaddsat_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: v_add_co_u32 v2, s0, v2, v6 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v3, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, s1, v0, v4 +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1 +; GFX11-NEXT: v_add_co_u32 v2, s2, v2, v6 +; GFX11-NEXT: v_add_co_u32 v3, s1, v3, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX11-NEXT: v_add_co_u32 v1, s2, v1, v4 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: v_add_co_u32 v3, s3, v3, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, s0 +; GFX11-NEXT: s_or_b32 s0, s1, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, s0 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] @@ -3048,40 +3207,96 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-LABEL: s_uaddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: s_addc_u32 s1, s1, s5 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s5 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s4 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s4, s4, 1 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: s_add_u32 s2, s2, s6 -; GFX6-NEXT: s_addc_u32 s3, s3, s7 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_add_u32 s3, s3, s7 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_add_u32 s3, s3, s4 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s4, s4, 1 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s5 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s4 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: s_add_u32 s2, s2, s6 -; GFX8-NEXT: s_addc_u32 s3, s3, s7 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_add_u32 s3, s3, s7 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_add_u32 s3, s3, s4 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s4 -; 
GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s5 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: s_add_u32 s2, s2, s6 -; GFX9-NEXT: s_addc_u32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_add_u32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_add_u32 s3, s3, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_v2i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s4, s5, s4 +; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_add_u32 s2, s2, s6 -; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s3, s3, s7 +; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s3, s3, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s4, s5, s4 +; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) @@ -3092,8 +3307,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; 
GFX6-LABEL: s_uaddsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: s_addc_u32 s1, s1, s5 -; GFX6-NEXT: s_addc_u32 s2, s2, s6 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s5 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s4 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_add_u32 s2, s2, s6 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_and_b32 s4, s4, 1 +; GFX6-NEXT: s_add_u32 s2, s2, s4 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s4, s4, 1 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 ; GFX6-NEXT: s_addc_u32 s3, s3, s7 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] @@ -3102,8 +3329,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-LABEL: s_uaddsat_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: s_addc_u32 s2, s2, s6 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s5 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s4 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_add_u32 s2, s2, s6 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_add_u32 s2, s2, s4 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 ; GFX8-NEXT: s_addc_u32 s3, s3, s7 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] @@ -3112,8 +3351,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-LABEL: s_uaddsat_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_addc_u32 s2, s2, s6 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s5 +; GFX9-NEXT: s_cselect_b32 s5, 1, 
0 +; GFX9-NEXT: s_add_u32 s1, s1, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_add_u32 s2, s2, s6 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-NEXT: s_addc_u32 s3, s3, s7 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] @@ -3122,8 +3373,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10PLUS-LABEL: s_uaddsat_i128: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 -; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s6 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s4, s5, s4 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s6 +; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s4, s5, s4 +; GFX10PLUS-NEXT: s_and_b32 s4, s4, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] @@ -3135,13 +3398,17 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX6-LABEL: uaddsat_i128_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, 
vcc, v5, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v1, s[0:1], v1, v4 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], s2, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3150,13 +3417,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; ; GFX8-LABEL: uaddsat_i128_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v4 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], s2, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3165,13 +3436,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; ; GFX9-LABEL: uaddsat_i128_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v4 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[4:5], s2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3180,9 +3455,15 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; ; GFX10PLUS-LABEL: uaddsat_i128_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, s1, v1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v2, s2, s2, v2 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v4 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v2, s0, v2, v4 +; GFX10PLUS-NEXT: s_or_b32 vcc_lo, s2, s0 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo @@ -3197,13 +3478,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX6-LABEL: 
uaddsat_i128_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v1, s[0:1], v1, v4 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], s2, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3212,13 +3497,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX8-LABEL: uaddsat_i128_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s1, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], v1, v4 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], s2, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 
-1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3227,13 +3516,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX9-LABEL: uaddsat_i128_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s1, v1 +; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v1, v4 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[4:5], s2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3242,9 +3535,15 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX10PLUS-LABEL: uaddsat_i128_vs: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, s0, v0, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s1, v1, s1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v2, s2, v2, s2 +; GFX10PLUS-NEXT: v_add_co_u32 v1, s0, v1, v4 +; GFX10PLUS-NEXT: s_or_b32 s0, s1, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10PLUS-NEXT: v_add_co_u32 v2, s0, v2, v4 +; GFX10PLUS-NEXT: s_or_b32 vcc_lo, s2, s0 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo ; 
GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo @@ -3260,17 +3559,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-LABEL: v_uaddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v0, v8 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v2, s[6:7], v2, v10 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GFX6-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; GFX6-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v6, s[6:7], v6, v14 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc @@ -3281,17 +3592,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-LABEL: v_uaddsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v8 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v8 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v2, s[6:7], v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v12 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v8 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v6, s[6:7], v6, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc @@ -3302,17 +3625,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-LABEL: v_uaddsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v9, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v10, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v8 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, 
s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v1, s[4:5], v1, v8 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v11, vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v13, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v8 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[6:7], v6, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 +; GFX9-NEXT: s_or_b64 vcc, s[6:7], vcc ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc @@ -3323,18 +3658,30 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-LABEL: v_uaddsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8 -; GFX10-NEXT: v_add_co_u32 v4, s4, v4, v12 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, v5, v13, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, v6, v14, s4 +; GFX10-NEXT: v_add_co_u32 v0, s5, v0, v8 +; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v10 +; GFX10-NEXT: v_add_co_u32 v4, s7, 
v4, v12 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v8 +; GFX10-NEXT: s_or_b32 s4, s4, s5 +; GFX10-NEXT: v_add_co_u32 v5, s5, v5, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7 +; GFX10-NEXT: s_or_b32 vcc_lo, s6, s4 +; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v14 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v15, s4 +; GFX10-NEXT: v_add_co_u32 v5, s4, v5, v8 +; GFX10-NEXT: s_or_b32 s4, s5, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v8 +; GFX10-NEXT: s_or_b32 s4, s6, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v15, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, -1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, -1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, -1, s4 @@ -3344,18 +3691,30 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-LABEL: v_uaddsat_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8 -; GFX11-NEXT: v_add_co_u32 v4, s0, v4, v12 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v5, v13, s0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v6, v14, s0 +; GFX11-NEXT: v_add_co_u32 v0, s1, v0, v8 +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, v9 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 +; GFX11-NEXT: v_add_co_u32 v2, s2, v2, v10 +; GFX11-NEXT: v_add_co_u32 v4, s3, v4, v12 +; GFX11-NEXT: v_add_co_u32 v1, s1, v1, v8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_add_co_u32 v5, s1, v5, v13 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX11-NEXT: 
v_add_co_u32 v2, s0, v2, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3 +; GFX11-NEXT: s_or_b32 vcc_lo, s2, s0 +; GFX11-NEXT: v_add_co_u32 v6, s2, v6, v14 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v7, v15, s0 +; GFX11-NEXT: v_add_co_u32 v5, s0, v5, v8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, s0, v6, v8 +; GFX11-NEXT: s_or_b32 s0, s2, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v7, v15, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, -1, s0 @@ -3369,14 +3728,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_uaddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s9 -; GFX6-NEXT: s_addc_u32 s2, s2, s10 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s9 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_add_u32 s1, s1, s8 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_add_u32 s2, s2, s10 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_add_u32 s2, s2, s8 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: s_addc_u32 s3, s3, s11 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX6-NEXT: s_add_u32 s4, s4, s12 -; GFX6-NEXT: s_addc_u32 s5, s5, s13 -; GFX6-NEXT: s_addc_u32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_add_u32 s5, s5, s13 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: 
s_add_u32 s5, s5, s8 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_add_u32 s6, s6, s14 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_add_u32 s6, s6, s8 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: s_addc_u32 s7, s7, s15 ; GFX6-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] @@ -3385,14 +3768,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-LABEL: s_uaddsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s8 -; GFX8-NEXT: s_addc_u32 s1, s1, s9 -; GFX8-NEXT: s_addc_u32 s2, s2, s10 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s9 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_add_u32 s1, s1, s8 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NEXT: s_add_u32 s2, s2, s10 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_add_u32 s2, s2, s8 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: s_addc_u32 s3, s3, s11 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX8-NEXT: s_add_u32 s4, s4, s12 -; GFX8-NEXT: s_addc_u32 s5, s5, s13 -; GFX8-NEXT: s_addc_u32 s6, s6, s14 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_add_u32 s5, s5, s13 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_add_u32 s5, s5, s8 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NEXT: s_add_u32 s6, s6, s14 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_add_u32 s6, s6, s8 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_or_b32 s8, s9, s8 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; 
GFX8-NEXT: s_addc_u32 s7, s7, s15 ; GFX8-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] ; GFX8-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] @@ -3401,14 +3808,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-LABEL: s_uaddsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s9 -; GFX9-NEXT: s_addc_u32 s2, s2, s10 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s9 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_add_u32 s1, s1, s8 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s2, s2, s10 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_add_u32 s2, s2, s8 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: s_addc_u32 s3, s3, s11 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX9-NEXT: s_add_u32 s4, s4, s12 -; GFX9-NEXT: s_addc_u32 s5, s5, s13 -; GFX9-NEXT: s_addc_u32 s6, s6, s14 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_add_u32 s5, s5, s13 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s6, s6, s14 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_add_u32 s6, s6, s8 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: s_addc_u32 s7, s7, s15 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] ; GFX9-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] @@ -3417,14 +3848,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10PLUS-LABEL: s_uaddsat_v2i128: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s9 -; 
GFX10PLUS-NEXT: s_addc_u32 s2, s2, s10 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s9 +; GFX10PLUS-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s1, s1, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s8, s9, s8 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s10 +; GFX10PLUS-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s8, s9, s8 +; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s11 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX10PLUS-NEXT: s_add_u32 s4, s4, s12 -; GFX10PLUS-NEXT: s_addc_u32 s5, s5, s13 -; GFX10PLUS-NEXT: s_addc_u32 s6, s6, s14 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s5, s5, s13 +; GFX10PLUS-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s5, s5, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s8, s9, s8 +; GFX10PLUS-NEXT: s_add_u32 s6, s6, s14 +; GFX10PLUS-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1 +; GFX10PLUS-NEXT: s_add_u32 s6, s6, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10PLUS-NEXT: s_or_b32 s8, s9, s8 +; GFX10PLUS-NEXT: s_and_b32 s8, s8, 1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10PLUS-NEXT: s_addc_u32 s7, s7, s15 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] ; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] From b8f3e544e214e8b9976f05e9a7f23a2e02860de4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Fri, 23 Feb 2024 14:29:41 +0100 Subject: [PATCH 351/351] address review comments --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 2 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 66 ++++++++++--------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git 
a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index abc2ebdfd878c..1eb5fa93c0dfd 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -935,7 +935,7 @@ class CombinerHelper { // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y. bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo); - bool isZExtOrTruncLegal(LLT ToTy, LLT FromTy) const; + bool isZExtOrTruncLegalOrBeforeLegalizer(LLT DstTy, LLT SrcTy) const; }; } // namespace llvm diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 96cc6e8c06c1d..d7411a24d7c5a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6936,15 +6936,16 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) { return false; } -bool CombinerHelper::isZExtOrTruncLegal(LLT ToTy, LLT FromTy) const { +bool CombinerHelper::isZExtOrTruncLegalOrBeforeLegalizer(LLT DstTy, + LLT SrcTy) const { // Copy. - if (ToTy == FromTy) + if (DstTy == SrcTy) return true; - if (isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {ToTy, FromTy}})) + if (isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) return true; - if (isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {ToTy, FromTy}})) + if (isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) return true; return false; @@ -6963,16 +6964,18 @@ bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, bool IsSigned = Add->isSigned(); LLT DstTy = MRI.getType(Dst); LLT CarryTy = MRI.getType(Carry); - LLT OperandTy = MRI.getType(LHS); - LLT CarryInTy = MRI.getType(CarryIn); // FIXME: handle undef - // fold sadde, if the carry is dead -> add(add(LHS, RHS), - // zextOrTrunc(CarryIn)), undef. 
- if (MRI.use_nodbg_empty(Carry) && IsSigned && MRI.hasOneNonDBGUse(Dst) && + // We want do fold the [u|s]adde. + if (!MRI.hasOneNonDBGUse(Dst)) + return false; + + // Fold sadde, if the carry is dead -> + // add(add(LHS, RHS), zextOrTrunc(CarryIn)), undef. + if (MRI.use_nodbg_empty(Carry) && IsSigned && isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) && - isZExtOrTruncLegal(DstTy, CarryInTy)) { + isZExtOrTruncLegalOrBeforeLegalizer(DstTy, CarryTy)) { MatchInfo = [=](MachineIRBuilder &B) { auto A = B.buildAdd(DstTy, LHS, RHS); Register AReg = A.getReg(0); @@ -6985,7 +6988,7 @@ bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, } // We want do fold the [u|s]adde. - if (!MRI.hasOneNonDBGUse(Dst) || !MRI.hasOneNonDBGUse(Carry)) + if (!MRI.hasOneNonDBGUse(Carry)) return false; // The parameters of the adde must be integer-like. @@ -6993,7 +6996,7 @@ bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, std::optional MaybeRHS = getConstantOrConstantSplatVector(RHS); std::optional MaybeCarryIn = getConstantOrConstantSplatVector(CarryIn); - // fold adde(c, c, c) -> c, carry + // Fold adde(c1, c2, c3) -> c4, carry if (MaybeLHS && MaybeRHS && MaybeCarryIn && isConstantLegalOrBeforeLegalizer(DstTy) && isConstantLegalOrBeforeLegalizer(CarryTy)) { @@ -7015,7 +7018,7 @@ bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, B.buildConstant(Carry, FirstOverflowed | SecondOverflowed); }; return true; - } else if (!IsSigned) { + } else { APInt LHS = MaybeLHS->zext(BitWidth); APInt RHS = MaybeRHS->zext(BitWidth); APInt CarryIn = MaybeCarryIn->zext(BitWidth); @@ -7031,7 +7034,7 @@ bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, } } - // canonicalize constant to RHS. + // Canonicalize constant to RHS. 
if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) { if (IsSigned) { MatchInfo = [=](MachineIRBuilder &B) { @@ -7046,41 +7049,40 @@ bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, } } - // fold adde(LHS, RHS, 0) -> addo(LHS, RHS) + // Fold adde(LHS, RHS, 0) -> addo(LHS, RHS) if (MaybeCarryIn && *MaybeCarryIn == 0) { if (IsSigned && isLegalOrBeforeLegalizer( - {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}})) { + {TargetOpcode::G_SADDO, {DstTy, CarryTy, DstTy}})) { MatchInfo = [=](MachineIRBuilder &B) { B.buildSAddo(Dst, Carry, LHS, RHS); }; return true; - } else if (!IsSigned && - isLegalOrBeforeLegalizer( - {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}})) + } else if (!IsSigned && isLegalOrBeforeLegalizer({TargetOpcode::G_UADDO, + {DstTy, CarryTy, DstTy}})) MatchInfo = [=](MachineIRBuilder &B) { B.buildUAddo(Dst, Carry, LHS, RHS); }; return true; } - // fold adde(LHS, 0, Carry) -> addo(LHS, Carry) + // Fold adde(LHS, 0, Carry) -> addo(LHS, Carry) if (MaybeRHS && *MaybeRHS == 0) { if (IsSigned && isLegalOrBeforeLegalizer( - {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}}) && - isZExtOrTruncLegal(OperandTy, CarryInTy)) { + {TargetOpcode::G_SADDO, {DstTy, CarryTy, DstTy}}) && + isZExtOrTruncLegalOrBeforeLegalizer(DstTy, CarryTy)) { MatchInfo = [=](MachineIRBuilder &B) { - auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + auto ZextCarryIn = B.buildZExtOrTrunc(DstTy, CarryIn); Register ZextCarryInReg = ZextCarryIn.getReg(0); B.buildSAddo(Dst, Carry, LHS, ZextCarryInReg); }; return true; } else if (!IsSigned && isLegalOrBeforeLegalizer( - {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}}) && - isZExtOrTruncLegal(OperandTy, CarryInTy)) { + {TargetOpcode::G_UADDO, {DstTy, CarryTy, DstTy}}) && + isZExtOrTruncLegalOrBeforeLegalizer(DstTy, CarryTy)) { MatchInfo = [=](MachineIRBuilder &B) { - auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + auto ZextCarryIn = B.buildZExtOrTrunc(DstTy, CarryIn); 
Register ZextCarryInReg = ZextCarryIn.getReg(0); B.buildUAddo(Dst, Carry, LHS, ZextCarryInReg); }; @@ -7091,14 +7093,14 @@ bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, // We lower to 2*addo + 1*or. if (IsSigned && isLegalOrBeforeLegalizer( - {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}}) && + {TargetOpcode::G_SADDO, {DstTy, CarryTy, DstTy}}) && isLegalOrBeforeLegalizer({TargetOpcode::G_OR, {DstTy}}) && - isZExtOrTruncLegal(OperandTy, CarryInTy)) { + isZExtOrTruncLegalOrBeforeLegalizer(DstTy, CarryTy)) { MatchInfo = [=](MachineIRBuilder &B) { auto First = B.buildSAddo(DstTy, CarryTy, LHS, RHS); Register FirstResult = First.getReg(0); Register FirstCarry = First.getReg(1); - auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + auto ZextCarryIn = B.buildZExtOrTrunc(DstTy, CarryIn); auto Second = B.buildSAddo(DstTy, CarryTy, FirstResult, ZextCarryIn); Register Result = Second.getReg(0); Register SecondCarry = Second.getReg(1); @@ -7108,14 +7110,14 @@ bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI, return true; } else if (!IsSigned && isLegalOrBeforeLegalizer( - {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}}) && + {TargetOpcode::G_UADDO, {DstTy, CarryTy, DstTy}}) && isLegalOrBeforeLegalizer({TargetOpcode::G_OR, {DstTy}}) && - isZExtOrTruncLegal(OperandTy, CarryInTy)) { + isZExtOrTruncLegalOrBeforeLegalizer(DstTy, CarryTy)) { MatchInfo = [=](MachineIRBuilder &B) { auto First = B.buildUAddo(DstTy, CarryTy, LHS, RHS); Register FirstResult = First.getReg(0); Register FirstCarry = First.getReg(1); - auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn); + auto ZextCarryIn = B.buildZExtOrTrunc(DstTy, CarryIn); auto Second = B.buildUAddo(DstTy, CarryTy, FirstResult, ZextCarryIn); Register Result = Second.getReg(0); Register SecondCarry = Second.getReg(1);